# Assumed imports: json and numpy are needed by the code below; the Bot base
# class and the MLP network are defined elsewhere in this project.
import json

import numpy as np


class Bot_RL_MLP(Bot):
    """Reinforcement-learning bot: an MLP maps the board state to Q-values for each field."""

    def __init__(self, size_x=3, size_y=3, beta=1, hidden=20, learning_rate=0.1,
                 reward=[0, 1.0, -1.0], initial_field=[0], player_ID=1):
        Bot.__init__(self)
        self.initial_field = initial_field
        self.player_ID = player_ID
        self.bot_name = "Bot_RL_MLP"
        self.version = 1
        self.counter = 0
        self.optimization = []
        self.reward = reward[:]
        self.first_action = True
        # High beta value (e.g. 50): exploitation; low beta value: exploration
        self.beta = beta
        self.mlp = MLP(size_x * size_y, hidden, size_x * size_y, learning_rate)
        self.new_game()

    def new_game(self):
        """Initializes a new game."""
        self.first_action = True
        self.counter += 1
        self.mlp.new_game()

    def load_data(self, filename):
        """Loads the bot state from a JSON file."""
        fo = open(filename, "r")
        data = json.loads(fo.read())
        fo.close()
        if data["bot"] == self.bot_name:
            if data["version"] <= self.version:
                self.player_ID = data["player_ID"]
                self.initial_field = data["initial_field"]
                self.counter = data["counter"]
                self.optimization = data["optimization"]
                self.reward = data["reward"]
                self.first_action = data["first_action"]
                self.beta = data["beta"]
                self.mlp.set_data(data["MLP"])
            else:
                raise ValueError('dataset is not usable by this Bot version : dataset version is higher than Bot version')
        else:
            raise ValueError('dataset is not usable by Bot : different Bot identifier')
        return data

    def save_data(self, filename):
        """Saves the bot state to a JSON file."""
        data = {"bot": self.bot_name,
                "version": self.version,
                "player_ID": self.player_ID,
                "initial_field": self.initial_field,
                "counter": self.counter,
                "optimization": self.optimization,
                "reward": self.reward,
                "first_action": self.first_action,
                "beta": self.beta,
                "MLP": self.mlp.get_data()}
        fo = open(filename, "w")
        fo.write(json.dumps(data))
        fo.close()

    def get_action(self, world_old):
        """Returns an action depending on the given world."""
        self.info_tic = world_old.get_sensor_info()
        self.h_tic = self.mlp.get_action(self.info_tic)
        moves = world_old.get_moves()
        # Shortcut: if only one move is left, play it automatically
        if len(moves) == 1:
            self.act_tic = moves[0]
        else:
            # Repeat the selection until a valid move has been chosen
            validation = False
            while not validation:
                # Keep only the network outputs of the fields that are still free
                new_h_tic = []
                for i in range(len(self.h_tic)):
                    if i in moves:
                        new_h_tic.append(self.h_tic[i])
                self.act_tic = moves[self.rand_winner(new_h_tic, self.beta)]  # choose action
                x = self.act_tic % world_old.size_x
                y = self.act_tic // world_old.size_y
                validation = world_old.check_action(x, y)
        # Convert the 1D board index to 2D coordinates
        x = self.act_tic % world_old.size_x
        y = self.act_tic // world_old.size_y
        return (x, y)

    def evaluate_action(self, world_new):
        """Adapts the MLP considering the results (world_new) of its last action."""
        if not self.first_action:
            # Q-values before and after the action
            q0 = self.h[self.act]
            q1 = self.mlp.get_action(world_new.get_sensor_info())[self.act_tic]
            # Reward on the new board
            r = self.get_reward(world_new.get_winner())
            if r == self.get_reward(1):   # This is cleaner than defining
                target = r                # target as r + 0.9 * q1,
            else:                         # because weights now converge.
                target = 0.9 * q1         # gamma = 0.9
            delta = target - q0           # prediction error
            # Important: only the delta at the position of the action is treated
            # as an error; for all other positions the error is 0
            error = np.zeros(self.mlp.input_size)
            error[self.act] = delta
            # Important: learning uses the error and the world BEFORE the action
            self.mlp.evaluate_action_RL(self.info, error)
        self.info = self.info_tic
        self.h = self.h_tic
        self.act = self.act_tic
        self.first_action = False

    def rand_winner(self, S_from, beta):
        """Selects an action via softmax (Boltzmann) exploration."""
        total = 0.0
        p_i = 0.0
        rnd = np.random.random()
        d_r = len(S_from)
        sel = 0
        try:
            for i in range(d_r):
                total += np.exp(beta * min(S_from[i], 200))
            # If the sum is zero, set the value to 1 for all fields
            # so that every field gets a probability greater than 0
            if total == 0:
                total = d_r
                S_from = [1] * d_r
            for i in range(d_r):
                p_i += np.exp(beta * min(S_from[i], 200)) / total
                if p_i > rnd:
                    sel = i
                    rnd = 1.1  # out of reach, so no later index will be selected
        except Exception:
            print(beta, S_from[i], S_from, total)
        return sel

    def get_reward(self, winner):
        """Calculates the reward for the current board setup."""
        if (winner >= 0) and (winner <= 2):
            return self.reward[int(winner)]
        return 0.0
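# --------------------------------------------------------------------------- #
# Usage sketch (not part of the original listing): one possible way to drive
# the version-1 bot.  The calling order is inferred from how get_action()
# stores info_tic/h_tic/act_tic and how evaluate_action() consumes them for a
# SARSA-style update of the previous move.  `world` is assumed to offer the
# same interface the bot already uses above (get_sensor_info, get_moves,
# check_action, perform_action, get_winner, active_player, size_x, size_y),
# and `opponent` is assumed to be another Bot with a get_action() method.
# The function and parameter names here are illustrative, not from the project.
# --------------------------------------------------------------------------- #

def train_one_episode(bot, opponent, world, bot_player_ID=1):
    """Hypothetical helper: plays one game and lets `bot` learn as it goes."""
    bot.new_game()
    while world.get_winner() < 0:
        if world.active_player == bot_player_ID:
            (x, y) = bot.get_action(world)  # choose a move for the current board
            bot.evaluate_action(world)      # SARSA update for the previous move
            world.perform_action(x, y)
        else:
            (x, y) = opponent.get_action(world)
            world.perform_action(x, y)
    bot.evaluate_action(world)              # propagate the terminal reward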
# --------------------------------------------------------------------------- #
# Version 2.0 of Bot_RL_MLP: get_action()/evaluate_action() are replaced by a
# self-contained self-play training loop, play_game().
# --------------------------------------------------------------------------- #

class Bot_RL_MLP(Bot):
    """Reinforcement-learning bot: an MLP maps the board state to Q-values for each field."""

    def __init__(self, size_x=3, size_y=3, beta=1, hidden=20, learning_rate=0.1,
                 reward=[0, 1.0, -1.0], initial_field=[0], player_ID=1):
        Bot.__init__(self)
        self.initial_field = initial_field
        self.player_ID = player_ID
        self.bot_name = "Bot_RL_MLP"
        self.version = 2.0
        self.counter = 0
        self.optimization = []
        self.reward = reward[:]
        self.first_action = True
        # High beta value (e.g. 50): exploitation; low beta value: exploration
        self.beta = beta
        self.mlp = MLP(size_x * size_y, hidden, size_x * size_y, learning_rate)
        self.new_game()

    def new_game(self):
        """Initializes a new game."""
        self.first_action = True
        self.counter += 1
        self.mlp.new_game()

    def load_data(self, filename):
        """Loads the bot state from a JSON file."""
        fo = open(filename, "r")
        data = json.loads(fo.read())
        fo.close()
        if data["bot"] == self.bot_name:
            if data["version"] <= self.version:
                self.player_ID = data["player_ID"]
                self.initial_field = data["initial_field"]
                self.counter = data["counter"]
                self.optimization = data["optimization"]
                self.reward = data["reward"]
                self.first_action = data["first_action"]
                self.beta = data["beta"]
                self.mlp.set_data(data["MLP"])
            else:
                raise ValueError('dataset is not usable by this Bot version : dataset version is higher than Bot version')
        else:
            raise ValueError('dataset is not usable by Bot : different Bot identifier')
        return data

    def save_data(self, filename):
        """Saves the bot state to a JSON file."""
        data = {"bot": self.bot_name,
                "version": self.version,
                "player_ID": self.player_ID,
                "initial_field": self.initial_field,
                "counter": self.counter,
                "optimization": self.optimization,
                "reward": self.reward,
                "first_action": self.first_action,
                "beta": self.beta,
                "MLP": self.mlp.get_data()}
        fo = open(filename, "w")
        fo.write(json.dumps(data))
        fo.close()

    def play_game(self, world, player, train_bot):
        """Plays one training game against train_bot and updates the MLP after
        every own move (SARSA, gamma = 0.9)."""
        world.new_init()
        self.mlp.new_game()
        I = world.get_sensor_info()
        hidden, h = self.mlp.get_action(I)
        act = self.rand_winner(h, self.beta)  # choose action
        val = h[act]                          # q0: value before the action
        r = 0
        while world.get_winner() < 0:
            if world.active_player != player:
                # Opponent's turn
                (x, y) = train_bot.get_action(world)
                world.perform_action(x, y)
            else:
                # Do the selected action
                x = act % world.size_x
                y = act // world.size_y
                world.perform_action(x, y)
                # Let the opponent answer if the game is not over yet
                if world.get_winner() < 0:
                    (x, y) = train_bot.get_action(world)
                    world.perform_action(x, y)
                r = self.get_reward(world.get_winner())       # read reward
                I_tic = world.get_sensor_info()               # read new state
                hidden_tic, h_tic = self.mlp.get_action(I_tic)
                act_tic = self.rand_winner(h_tic, self.beta)  # choose next action
                val_tic = h_tic[act_tic]                      # q1: value after the action
                if r == 1.0:                  # This is cleaner than defining
                    target = r                # target as r + 0.9 * val_tic,
                else:                         # because weights now converge.
                    target = 0.9 * val_tic    # gamma = 0.9
                delta = target - val          # prediction error
                # Only the position of the chosen action carries the error
                error = np.zeros(self.mlp.output_size)
                error[act] = delta
                # Learn from the error with the activations from BEFORE the action
                self.mlp.evaluate_action_RL(h, hidden, error)
                I = I_tic
                val = val_tic
                act = act_tic
                hidden = hidden_tic

    def rand_winner(self, S_from, beta):
        """Selects an action via softmax (Boltzmann) exploration."""
        total = 0.0
        p_i = 0.0
        rnd = np.random.random()
        d_r = len(S_from)
        sel = 0
        try:
            for i in range(d_r):
                total += np.exp(beta * min(S_from[i], 200))
            # If the sum is zero, set the value to 1 for all fields
            # so that every field gets a probability greater than 0
            if total == 0:
                total = d_r
                S_from = [1] * d_r
            for i in range(d_r):
                p_i += np.exp(beta * min(S_from[i], 200)) / total
                if p_i > rnd:
                    sel = i
                    rnd = 1.1  # out of reach, so no later index will be selected
        except Exception:
            print(beta, S_from[i], S_from, total)
        return sel

    def get_reward(self, winner):
        """Calculates the reward for the current board setup."""
        if (winner >= 0) and (winner <= 2):
            return self.reward[int(winner)]
        return 0.0
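# --------------------------------------------------------------------------- #
# Training sketch (not part of the original listing): how the version-2 bot's
# play_game() self-training loop might be driven.  Bot_Random is assumed to be
# another Bot from this project that plays random legal moves; the world class
# name World_Tic_Tac_Toe, its constructor arguments, the episode count, and the
# output filename are illustrative assumptions, not confirmed by the source.
# --------------------------------------------------------------------------- #

def train(episodes=10000, filename="bot_rl_mlp.json"):
    """Hypothetical training driver for Bot_RL_MLP (version 2.0)."""
    bot = Bot_RL_MLP(size_x=3, size_y=3, beta=1, hidden=20,
                     learning_rate=0.1, player_ID=1)
    opponent = Bot_Random()            # assumed opponent bot
    world = World_Tic_Tac_Toe(3, 3)    # assumed world class
    for episode in range(episodes):
        # play_game() resets the world and the MLP eligibility state itself
        bot.play_game(world, bot.player_ID, opponent)
    bot.save_data(filename)            # persist weights and settings as JSON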