class Agent:
    """Epsilon-greedy Q-learning agent driven by the drone's (x, y) position.

    ``epsilon`` is the probability of taking a random action. It starts at
    0.5 and is lowered by ``epsilon_decay_value`` on every ``get_action``
    call made while the episode number lies between
    START_EPSILON_DECAYING and END_EPSILON_DECAYING (inclusive).
    """

    def __init__(self):
        self.n_games = 0
        self.epsilon = 0.5  # probability of a random action
        self.gamma = 0.9  # discount rate
        # Bounded replay buffer: the deque drops its oldest entry when full.
        self.memory = deque(maxlen=MAX_MEMORY)
        # NOTE(review): arguments are presumably input/hidden/output sizes -
        # confirm against Linear_QNet.
        self.model = Linear_QNet(2, 256, 4)
        self.trainer = QTrainer(self.model, lr=LR, gamma=self.gamma)
        self.epsilon_decay_value = (self.epsilon) / (
            END_EPSILON_DECAYING - START_EPSILON_DECAYING)

    # TODO
    def get_state(self, game):
        """Return the drone position ``[x, y]`` as an integer numpy array."""
        drone = game.drone
        state = [drone.x, drone.y]
        return np.array(state, dtype=int)

    def get_action(self, state, episode):
        """Return a one-hot list of length 4 selecting the next move.

        With probability ``self.epsilon`` a uniformly random move is chosen
        (exploration); otherwise the move with the highest model output is
        used (exploitation).
        """
        if END_EPSILON_DECAYING >= episode >= START_EPSILON_DECAYING:
            # Clamp so repeated decay never produces a negative probability.
            self.epsilon = max(0.0, self.epsilon - self.epsilon_decay_value)

        final_move = [0, 0, 0, 0]
        # epsilon lives in [0, 1], so it must be compared with a uniform
        # draw from [0, 1). Comparing it with randint(0, 200) made
        # exploration fire only when the draw was exactly 0.
        if random.random() < self.epsilon:
            move = random.randint(0, 3)
        else:
            state0 = torch.tensor(state, dtype=torch.float)
            # Pure inference: no autograd graph is needed here.
            with torch.no_grad():
                prediction = self.model(state0)
            move = torch.argmax(prediction).item()
        final_move[move] = 1
        return final_move

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay buffer."""
        self.memory.append((state, action, reward, next_state, done))

    # TODO
    def train_long_memory(self):
        """Train on up to BATCH_SIZE transitions sampled from the buffer."""
        if not self.memory:
            # Nothing recorded yet; unpacking zip(*[]) below would raise.
            return
        if len(self.memory) > BATCH_SIZE:
            mini_sample = random.sample(self.memory, BATCH_SIZE)  # list of tuples
        else:
            mini_sample = self.memory
        states, actions, rewards, next_states, dones = zip(*mini_sample)
        self.trainer.train_step(states, actions, rewards, next_states, dones)

    def train_short_memory(self, state, action, reward, next_state, done):
        """Train on a single transition via ``self.trainer.train_step``."""
        self.trainer.train_step(state, action, reward, next_state, done)
class Agent:
    """Epsilon-greedy Q-learning agent using the drone position as state.

    The exploration probability ``epsilion`` (attribute spelling kept for
    compatibility) is multiplied by ``epsilion_decay_value`` on every
    ``get_action`` call.
    """

    def __init__(self):
        self.epsilion = 0.999  # probability of a random action
        self.gamma = 0.9
        # Bounded replay buffer: the deque drops its oldest entry when full.
        self.memory = deque(maxlen=MAX_MEMORY)
        # NOTE(review): arguments are presumably input/hidden/output sizes -
        # confirm against Linear_QNet.
        self.model = Linear_QNet(2, 256, 4)
        self.trainer = QTrainer(self.model, LR, self.gamma)
        self.epsilion_decay_value = 0.998  # multiplicative decay per action

    def get_state(self, game):
        """Return ``[game.drone_x, game.drone_y]`` as an integer numpy array."""
        # Only the drone position is used. An earlier variant also listed
        # game.man_x / game.man_y.
        state = [game.drone_x, game.drone_y]
        return np.array(state, dtype=int)

    def get_action(self, state, episode):
        """Return the index (0-3) of the next move.

        ``episode`` is accepted for interface compatibility but not used.
        """
        self.epsilion *= self.epsilion_decay_value
        if np.random.random() < self.epsilion:
            # Exploration: uniformly random action.
            return np.random.randint(0, 4)
        state0 = torch.tensor(state, dtype=torch.float)
        # Pure inference: no autograd graph is needed here.
        with torch.no_grad():
            prediction = self.model(state0)
        return torch.argmax(prediction).item()

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay buffer."""
        self.memory.append((state, action, reward, next_state, done))

    def train_long_memory(self):
        """Train on up to BATCH_SIZE transitions sampled from the buffer."""
        if not self.memory:
            # Nothing recorded yet; unpacking zip(*[]) below would raise.
            return
        if len(self.memory) > BATCH_SIZE:
            mini_sample = random.sample(self.memory, BATCH_SIZE)
        else:
            mini_sample = self.memory
        states, actions, rewards, next_states, dones = zip(*mini_sample)
        self.trainer.train_step(states, actions, rewards, next_states, dones)

    def train_short_memory(self, state, action, reward, next_state, done):
        """Train on a single transition via ``self.trainer.train_step``."""
        self.trainer.train_step(state, action, reward, next_state, done)
class Agent:
    """Epsilon-greedy Q-learning agent with a bounded replay buffer."""

    def __init__(self):
        # Bounded replay buffer: the deque drops its oldest entry when full.
        self.memory = deque(maxlen=MAX_MEM)
        self.n_games: int = 0
        self.epsilon = 0  # recomputed from n_games on every get_action call
        self.gamma = 0.9
        # NOTE(review): arguments are presumably input/hidden/output sizes -
        # confirm against Q_Net.
        self.model = Q_Net(11, 256, 3)
        self.trainer = QTrainer(self.model, lr=LR, gamma=self.gamma)

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay buffer."""
        self.memory.append((state, action, reward, next_state, done))

    def train_long_memory(self):
        """Replay up to BATCH_SIZE stored transitions, one train step each."""
        if len(self.memory) > BATCH_SIZE:
            mini_sample = random.sample(self.memory, BATCH_SIZE)  # list of tuples
        else:
            mini_sample = self.memory
        for state, action, reward, next_state, done in mini_sample:
            self.trainer.train_step(state, action, reward, next_state, done)

    def train_short_memory(self, state, action, reward, next_state, done):
        """Train on a single transition via ``self.trainer.train_step``."""
        self.trainer.train_step(state, action, reward, next_state, done)

    def get_action(self, state):
        """Return a one-hot list of length 3 selecting the next move.

        A random move is taken when a draw from 0..200 falls below
        ``80 - n_games``; from the 80th game on the model always decides.
        """
        self.epsilon = 80 - self.n_games
        final_move = [0, 0, 0]
        if random.randint(0, 200) < self.epsilon:
            move = random.randint(0, 2)
        else:
            state0 = torch.tensor(state, dtype=torch.float)
            # Pure inference: no autograd graph is needed here.
            with torch.no_grad():
                prediction = self.model(state0)
            move = torch.argmax(prediction).item()
        final_move[move] = 1
        return final_move
class Agent:
    """Epsilon-greedy Q-learning agent for the snake game."""

    def __init__(self):
        self.number_of_games = 0
        self.epsilon = 0  # randomness; recomputed on every get_action call
        self.gamma = 0.8  # discount rate
        # Bounded replay buffer: the deque drops its oldest entry when full.
        self.memory = deque(maxlen=MAX_MEMORY)
        # NOTE(review): arguments are presumably input/hidden/output sizes -
        # confirm against LinearQNet.
        self.model = LinearQNet(11, 256, 3)
        self.trainer = QTrainer(self.model, learning_rate=LR, gamma=self.gamma)

    def get_state(self, game):
        """Return the 11-element integer state vector for ``game``.

        Layout: danger straight/right/left (relative to the current
        heading), four heading flags (left, right, up, down) and four
        food-position flags (left, right, up, down).
        """
        head = game.snake[0]
        # Cells adjacent to the head.
        point_left = Point(head.x - BLOCK_SIZE, head.y)
        point_right = Point(head.x + BLOCK_SIZE, head.y)
        point_up = Point(head.x, head.y - BLOCK_SIZE)
        point_down = Point(head.x, head.y + BLOCK_SIZE)

        direction_left = game.direction == Direction.LEFT
        direction_right = game.direction == Direction.RIGHT
        direction_up = game.direction == Direction.UP
        direction_down = game.direction == Direction.DOWN

        state = [
            # Danger straight
            (direction_right and game.is_collision(point_right)) or
            (direction_left and game.is_collision(point_left)) or
            (direction_up and game.is_collision(point_up)) or
            (direction_down and game.is_collision(point_down)),

            # Danger right
            (direction_up and game.is_collision(point_right)) or
            (direction_down and game.is_collision(point_left)) or
            (direction_left and game.is_collision(point_up)) or
            (direction_right and game.is_collision(point_down)),

            # Danger left
            (direction_down and game.is_collision(point_right)) or
            (direction_up and game.is_collision(point_left)) or
            (direction_right and game.is_collision(point_up)) or
            (direction_left and game.is_collision(point_down)),

            # Move direction
            direction_left,
            direction_right,
            direction_up,
            direction_down,

            # Food location
            game.food.x < game.head.x,  # food left
            game.food.x > game.head.x,  # food right
            game.food.y < game.head.y,  # food up
            game.food.y > game.head.y,  # food down
        ]
        return np.array(state, dtype=int)

    def get_action(self, state):
        """Return a one-hot list of length 3 selecting the next move.

        Random moves trade exploration against exploitation: a random move
        is taken when a draw from 0..200 falls below
        ``80 - number_of_games / 10``.
        """
        self.epsilon = 80 - self.number_of_games / 10
        final_move = [0, 0, 0]
        if random.randint(0, 200) < self.epsilon:
            move = random.randint(0, 2)
        else:
            initial_state = torch.tensor(state, dtype=torch.float)
            # Pure inference: no autograd graph is needed here.
            with torch.no_grad():
                prediction = self.model(initial_state)
            move = torch.argmax(prediction).item()
        final_move[move] = 1
        return final_move

    def remember(self, state, action, reward, next_state, game_over):
        """Append one transition to the replay buffer."""
        self.memory.append((state, action, reward, next_state, game_over))

    def train_long_memory(self):
        """Train on up to BATCH_SIZE transitions sampled from the buffer."""
        if not self.memory:
            # Nothing recorded yet; unpacking zip(*[]) below would raise.
            return
        if len(self.memory) > BATCH_SIZE:
            sample = random.sample(self.memory, BATCH_SIZE)
        else:
            sample = self.memory
        states, actions, rewards, next_states, game_overs = zip(*sample)
        self.trainer.train_step(states, actions, rewards, next_states, game_overs)

    def train_short_memory(self, state, action, reward, next_state, game_over):
        """Train on a single transition via ``self.trainer.train_step``."""
        self.trainer.train_step(state, action, reward, next_state, game_over)
class Agent:
    """Agent class: the intermediary between the model and the game."""

    def __init__(self):
        """Restore the saved game counter and build the model and trainer.

        The number of games played so far is read from ``games.txt``. A
        missing file is treated as a fresh start (0 games), so a first run
        no longer requires editing this method.
        """
        try:
            with open('games.txt', 'r') as f:
                self.n_games = int(f.read())
        except FileNotFoundError:
            # First run: no counter has been saved yet.
            self.n_games = 0
        print(self.n_games)
        self.epsilon = 0
        self.gamma = 0.9
        # Bounded replay buffer: the deque drops its oldest entry when full.
        self.memory = deque(maxlen=MAX_MEMORY)
        self.model = Linear_QNet(11, 256, 3)
        # Uncomment to continue from previously saved weights:
        # self.model.load_state_dict(torch.load('model/model.pth'))
        self.model.eval()
        self.trainer = QTrainer(self.model, lr=LR, gamma=self.gamma)

    def get_state(self, game):
        """Return the 11-element integer state vector describing ``game``."""
        head = game.snake[0]
        # Cells adjacent to the head and flags for the current heading.
        point_l = Point(head.x - BLOCK_SIZE, head.y)
        point_r = Point(head.x + BLOCK_SIZE, head.y)
        point_u = Point(head.x, head.y - BLOCK_SIZE)
        point_d = Point(head.x, head.y + BLOCK_SIZE)

        dir_l = game.direction == Direction.LEFT
        dir_r = game.direction == Direction.RIGHT
        dir_u = game.direction == Direction.UP
        dir_d = game.direction == Direction.DOWN

        state = [
            # Danger ahead?
            (dir_r and game.is_collision(point_r)) or
            (dir_l and game.is_collision(point_l)) or
            (dir_u and game.is_collision(point_u)) or
            (dir_d and game.is_collision(point_d)),

            # Danger to the right?
            (dir_u and game.is_collision(point_r)) or
            (dir_d and game.is_collision(point_l)) or
            (dir_l and game.is_collision(point_u)) or
            (dir_r and game.is_collision(point_d)),

            # Danger to the left?
            (dir_d and game.is_collision(point_r)) or
            (dir_u and game.is_collision(point_l)) or
            (dir_r and game.is_collision(point_u)) or
            (dir_l and game.is_collision(point_d)),

            # Snake heading.
            dir_l,
            dir_r,
            dir_u,
            dir_d,

            # Relative position of the food.
            game.food.x < game.head.x,
            game.food.x > game.head.x,
            game.food.y < game.head.y,
            game.food.y > game.head.y,
        ]
        return np.array(state, dtype=int)

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay buffer."""
        self.memory.append((state, action, reward, next_state, done))

    def train_long_memory(self):
        """Replay learning: train on up to BATCH_SIZE stored transitions."""
        if not self.memory:
            # Nothing recorded yet; unpacking zip(*[]) below would raise.
            return
        if len(self.memory) > BATCH_SIZE:
            mini_sample = random.sample(self.memory, BATCH_SIZE)
        else:
            mini_sample = self.memory
        states, actions, rewards, next_states, dones = zip(*mini_sample)
        self.trainer.train_step(states, actions, rewards, next_states, dones)

    def train_short_memory(self, state, action, reward, next_state, done):
        """Real-time learning: train on a single transition."""
        self.trainer.train_step(state, action, reward, next_state, done)

    def get_action(self, state):
        """Return a one-hot list of length 3 selecting the next move.

        A random move is taken when a draw from 0..500 falls below
        ``500 - n_games``; from the 500th game on the model always decides.
        """
        self.epsilon = 500 - self.n_games
        final_move = [0, 0, 0]
        if random.randint(0, 500) < self.epsilon:
            move = random.randint(0, 2)
        else:
            state0 = torch.tensor(state, dtype=torch.float)
            # Pure inference: no autograd graph is needed here.
            with torch.no_grad():
                prediction = self.model(state0)
            move = torch.argmax(prediction).item()
        final_move[move] = 1
        return final_move
class Agent:
    """Epsilon-greedy Q-learning agent for the snake game.

    Loads weights from ``./model/model.pth`` when that file exists.
    """

    def __init__(self):
        self.n_games = 0
        self.epsilon = 0  # randomness; recomputed on every get_action call
        self.gamma = 0.9  # discount rate
        # Bounded replay buffer: the deque drops its oldest entry when full.
        self.memory = deque(maxlen=max_memory)
        # NOTE(review): arguments are presumably input/hidden/output sizes -
        # confirm against Linear_QNet.
        self.model = Linear_QNet(11, 256, 3)
        checkpoint_path = './model/model.pth'
        if os.path.exists(checkpoint_path):
            self.model.load_state_dict(torch.load(checkpoint_path))
            # self.model.eval()
            print('Pretrained = True')
        self.trainer = QTrainer(self.model, lr=lr, gamma=self.gamma)

    def get_state(self, game):
        """Return the 11-element integer state vector for ``game``.

        Layout: danger straight/right/left (relative to the current
        heading), four heading flags and four food-position flags.
        """
        head = game.snake[0]
        # Cells adjacent to the head (20 is the grid step used here).
        point_l = Point(head.x - 20, head.y)
        point_r = Point(head.x + 20, head.y)
        point_u = Point(head.x, head.y - 20)
        point_d = Point(head.x, head.y + 20)

        dir_l = game.direction == Direction.LEFT
        dir_r = game.direction == Direction.RIGHT
        dir_u = game.direction == Direction.UP
        dir_d = game.direction == Direction.DOWN

        state = [
            # Danger straight
            (dir_r and game.is_collision(point_r)) or
            (dir_l and game.is_collision(point_l)) or
            (dir_u and game.is_collision(point_u)) or
            (dir_d and game.is_collision(point_d)),

            # Danger right
            (dir_u and game.is_collision(point_r)) or
            (dir_d and game.is_collision(point_l)) or
            (dir_l and game.is_collision(point_u)) or
            (dir_r and game.is_collision(point_d)),

            # Danger left
            (dir_d and game.is_collision(point_r)) or
            (dir_u and game.is_collision(point_l)) or
            (dir_r and game.is_collision(point_u)) or
            (dir_l and game.is_collision(point_d)),

            # Move direction
            dir_l,
            dir_r,
            dir_u,
            dir_d,

            # Food location
            game.food.x < game.head.x,  # food left
            game.food.x > game.head.x,  # food right
            game.food.y < game.head.y,  # food up
            game.food.y > game.head.y,  # food down
        ]
        return np.array(state, dtype=int)

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay buffer."""
        self.memory.append((state, action, reward, next_state, done))

    def train_long_memory(self):
        """Train on up to ``batch_size`` transitions sampled from the buffer."""
        if not self.memory:
            # Nothing recorded yet; unpacking zip(*[]) below would raise.
            return
        if len(self.memory) > batch_size:
            mini_sample = random.sample(self.memory, batch_size)  # list of tuples
        else:
            mini_sample = self.memory
        states, actions, rewards, next_states, dones = zip(*mini_sample)
        self.trainer.train_step(states, actions, rewards, next_states, dones)

    def train_short_memory(self, state, action, reward, next_state, done):
        """Train on a single transition via ``self.trainer.train_step``."""
        self.trainer.train_step(state, action, reward, next_state, done)

    def get_action(self, state):
        """Return a one-hot list of length 3 selecting the next move.

        Random moves trade exploration against exploitation: a random move
        is taken when a draw from 0..200 falls below ``80 - n_games``.
        """
        self.epsilon = 80 - self.n_games
        final_move = [0, 0, 0]
        if random.randint(0, 200) < self.epsilon:
            move = random.randint(0, 2)
        else:
            state0 = torch.tensor(state, dtype=torch.float)
            # Pure inference: no autograd graph is needed here.
            with torch.no_grad():
                prediction = self.model(state0)
            move = torch.argmax(prediction).item()
        final_move[move] = 1
        return final_move
class Agent:
    """Epsilon-greedy Q-learning agent controlling the ship in ``game``."""

    def __init__(self):
        self.n_games = 0
        self.epsilon = 0  # randomness; recomputed on every get_action call
        self.gamma = 0.9  # discount rate
        # Bounded replay buffer: the deque drops its oldest entry when full.
        self.memory = deque(maxlen=MAX_MEMORY)
        # NOTE(review): arguments are presumably input/hidden/output sizes -
        # confirm against Linear_QNet.
        self.model = Linear_QNet(4, 256, 3)
        self.trainer = QTrainer(self.model, lr=LR, gamma=self.gamma)

    def get_state(self, game):
        """Return ``[ship.center, moving_left, moving_right, standing_still]``.

        The result is an integer numpy array of length 4. Alien-count and
        screen-edge features were tried in an earlier variant and are not
        part of the state.
        """
        head = game.ship.center

        dir_l = game.ship.moving_left == True
        dir_r = game.ship.moving_right == True
        dir_s = game.ship.moving_left == False and game.ship.moving_right == False

        state = [
            head,
            dir_l,
            dir_r,
            dir_s,
        ]
        return np.array(state, dtype=int)

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay buffer."""
        self.memory.append((state, action, reward, next_state, done))

    def train_long_memory(self):
        """Train on up to BATCH_SIZE transitions sampled from the buffer."""
        if not self.memory:
            # Nothing recorded yet; unpacking zip(*[]) below would raise.
            return
        if len(self.memory) > BATCH_SIZE:
            mini_sample = random.sample(self.memory, BATCH_SIZE)  # list of tuples
        else:
            mini_sample = self.memory
        states, actions, rewards, next_states, dones = zip(*mini_sample)
        self.trainer.train_step(states, actions, rewards, next_states, dones)

    def train_short_memory(self, state, action, reward, next_state, done):
        """Train on a single transition via ``self.trainer.train_step``."""
        self.trainer.train_step(state, action, reward, next_state, done)

    def get_action(self, state):
        """Return a one-hot list of length 3 selecting the next move.

        Random moves trade exploration against exploitation: a random move
        is taken when a draw from 0..200 falls below ``80 - n_games``.
        """
        self.epsilon = 80 - self.n_games
        final_move = [0, 0, 0]
        if random.randint(0, 200) < self.epsilon:
            move = random.randint(0, 2)
        else:
            state0 = torch.tensor(state, dtype=torch.float)
            # Pure inference: no autograd graph is needed here.
            with torch.no_grad():
                prediction = self.model(state0)
            move = torch.argmax(prediction).item()
        final_move[move] = 1
        return final_move
class Agent:
    """Snake agent that plays from pretrained weights.

    The weights are loaded from ``./optimized_model/model.pth`` at
    construction time and every move is chosen by the model; random
    exploration is switched off.
    """

    def __init__(self):
        self.n_games = 0
        self.epsilon = 0  # randomness
        self.gamma = 0.9  # discount rate
        # Bounded replay buffer: the deque drops its oldest entry when full.
        self.memory = deque(maxlen=MAX_MEMORY)
        self.model = Linear_QNet(11, 256, 3)  # input layer: 11, hidden: 256, output: 3
        self.model.load_state_dict(torch.load('./optimized_model/model.pth'))
        self.trainer = QTrainer(self.model, lr=LR, gamma=self.gamma)

    def get_state(self, game):
        """Return the 11-element integer state vector for ``game``.

        Layout: danger straight/right/left (relative to the current
        heading), four heading flags and four food-position flags.
        """
        head = game.snake[0]
        BLOCK_SIZE = 20
        # Points to check for danger.
        point_l = Point(head.x - BLOCK_SIZE, head.y)
        point_r = Point(head.x + BLOCK_SIZE, head.y)
        point_u = Point(head.x, head.y - BLOCK_SIZE)
        point_d = Point(head.x, head.y + BLOCK_SIZE)

        dir_l = game.direction == Direction.LEFT
        dir_r = game.direction == Direction.RIGHT
        dir_u = game.direction == Direction.UP
        dir_d = game.direction == Direction.DOWN

        state = [
            # Danger straight
            (dir_r and game.is_collision(point_r)) or
            (dir_l and game.is_collision(point_l)) or
            (dir_u and game.is_collision(point_u)) or
            (dir_d and game.is_collision(point_d)),

            # Danger right
            (dir_u and game.is_collision(point_r)) or
            (dir_d and game.is_collision(point_l)) or
            (dir_l and game.is_collision(point_u)) or
            (dir_r and game.is_collision(point_d)),

            # Danger left
            (dir_d and game.is_collision(point_r)) or
            (dir_u and game.is_collision(point_l)) or
            (dir_r and game.is_collision(point_u)) or
            (dir_l and game.is_collision(point_d)),

            # Move direction
            dir_l,
            dir_r,
            dir_u,
            dir_d,

            # Food location
            game.food.x < game.head.x,  # food left
            game.food.x > game.head.x,  # food right
            game.food.y < game.head.y,  # food up
            game.food.y > game.head.y,  # food down
        ]
        return np.array(state, dtype=int)

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay buffer."""
        self.memory.append((state, action, reward, next_state, done))

    def train_long_memory(self):
        """Train on up to BATCH_SIZE transitions sampled from the buffer."""
        if not self.memory:
            # Nothing recorded yet; unpacking zip(*[]) below would raise.
            return
        if len(self.memory) > BATCH_SIZE:
            mini_sample = random.sample(self.memory, BATCH_SIZE)  # list of tuples
        else:
            mini_sample = self.memory
        states, actions, rewards, next_states, dones = zip(*mini_sample)
        self.trainer.train_step(states, actions, rewards, next_states, dones)

    def train_short_memory(self, state, action, reward, next_state, done):
        """Train on a single transition via ``self.trainer.train_step``."""
        self.trainer.train_step(state, action, reward, next_state, done)

    def get_action(self, state):
        """Return a one-hot list of length 3 with the model's preferred move.

        ``self.epsilon`` is still updated to ``80 - n_games`` for anyone
        inspecting it, but it no longer influences the choice. The original
        random-move branch was disabled with ``and False`` and could never
        run, so it has been removed.
        """
        self.epsilon = 80 - self.n_games
        final_move = [0, 0, 0]
        state0 = torch.tensor(state, dtype=torch.float)
        # Call the module rather than .forward() so registered hooks run,
        # and skip autograd bookkeeping because this is pure inference.
        with torch.no_grad():
            prediction = self.model(state0)
        move = torch.argmax(prediction).item()
        final_move[move] = 1
        return final_move
class Agent:
    """Epsilon-greedy Q-learning agent for the snake game.

    Besides the replay buffer it keeps ``statusGame``, a list of game
    snapshots recorded through ``addStatus``.
    """

    def __init__(self):
        self.n_games = 0
        self.n_revise = 0
        self.epsilon = 0  # randomness; recomputed on every get_action call
        self.gamma = 0.9  # discount rate
        # Bounded replay buffer: the deque drops its oldest entry when full.
        self.memory = deque(maxlen=MAX_MEMORY)
        self.statusGame = []
        # NOTE(review): arguments are presumably input/hidden/output sizes -
        # confirm against Linear_QNet.
        self.model = Linear_QNet(11, 256, 3)
        self.trainer = QTrainer(self.model, lr=LR, gamma=self.gamma)

    def get_state(self, game):
        """Return the 11-element integer state vector for ``game``.

        Layout: danger straight/right/left (relative to the current
        heading), four heading flags and four food-position flags.
        """
        head = game.snake[0]
        # Cells adjacent to the head (20 is the grid step used here).
        point_l = Point(head.x - 20, head.y)
        point_r = Point(head.x + 20, head.y)
        point_u = Point(head.x, head.y - 20)
        point_d = Point(head.x, head.y + 20)

        dir_l = game.direction == Direction.LEFT
        dir_r = game.direction == Direction.RIGHT
        dir_u = game.direction == Direction.UP
        dir_d = game.direction == Direction.DOWN

        state = [
            # Danger straight
            (dir_r and game.is_collision(point_r)) or
            (dir_l and game.is_collision(point_l)) or
            (dir_u and game.is_collision(point_u)) or
            (dir_d and game.is_collision(point_d)),

            # Danger right
            (dir_u and game.is_collision(point_r)) or
            (dir_d and game.is_collision(point_l)) or
            (dir_l and game.is_collision(point_u)) or
            (dir_r and game.is_collision(point_d)),

            # Danger left
            (dir_d and game.is_collision(point_r)) or
            (dir_u and game.is_collision(point_l)) or
            (dir_r and game.is_collision(point_u)) or
            (dir_l and game.is_collision(point_d)),

            # Move direction
            dir_l,
            dir_r,
            dir_u,
            dir_d,

            # Food location
            game.food.x < game.head.x,  # food left
            game.food.x > game.head.x,  # food right
            game.food.y < game.head.y,  # food up
            game.food.y > game.head.y,  # food down
        ]
        return np.array(state, dtype=int)

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay buffer."""
        self.memory.append((state, action, reward, next_state, done))

    def addStatus(self, Snake2, Score2, food2, frame_iteration, direction,
                  old_record):
        """Append a snapshot of the game to ``self.statusGame``.

        The snake segments and the food are stored as plain ``[x, y]``
        lists; the remaining arguments are stored unchanged.
        """
        food = [food2.x, food2.y]
        snake = [[segment.x, segment.y] for segment in Snake2]
        self.statusGame.append(
            [snake, Score2, food, frame_iteration, direction, old_record])

    def train_long_memory(self):
        """Train on up to BATCH_SIZE transitions sampled from the buffer."""
        if not self.memory:
            # Nothing recorded yet; unpacking zip(*[]) below would raise.
            return
        if len(self.memory) > BATCH_SIZE:
            mini_sample = random.sample(self.memory, BATCH_SIZE)  # list of tuples
        else:
            mini_sample = self.memory
        states, actions, rewards, next_states, dones = zip(*mini_sample)
        self.trainer.train_step(states, actions, rewards, next_states, dones)

    def train_short_memory(self, state, action, reward, next_state, done):
        """Train on a single transition via ``self.trainer.train_step``."""
        self.trainer.train_step(state, action, reward, next_state, done)

    def get_action(self, state):
        """Return a one-hot list of length 3 selecting the next move.

        Random moves trade exploration against exploitation: a random move
        is taken when a draw from 0..200 falls below ``80 - n_games``.
        """
        self.epsilon = 80 - self.n_games
        final_move = [0, 0, 0]
        if random.randint(0, 200) < self.epsilon:
            move = random.randint(0, 2)
        else:
            state0 = torch.tensor(state, dtype=torch.float)
            # Pure inference: no autograd graph is needed here.
            with torch.no_grad():
                prediction = self.model(state0)
            move = torch.argmax(prediction).item()
        final_move[move] = 1
        return final_move
class Agent:
    """Snake agent whose hyper-parameters come from a YAML file.

    ``args.parameters_file`` names the YAML file; the keys read here are
    ``max_memory``, ``lr`` and ``batch_size``. When ``args.use_trained`` is
    true the agent never explores and always follows ``model``.
    """

    def __init__(self, args, model):
        self.parameters_file = args.parameters_file
        self.args = args
        # Use a context manager so the file handle is closed after parsing.
        with open(self.parameters_file, 'r') as f:
            self.parameters = yaml.load(f, Loader=yaml.FullLoader)
        self.n_games = 0
        self.epsilon = 0  # randomness
        self.gamma = 0.9  # discount rate
        # Bounded replay buffer: the deque drops its oldest entry when full.
        self.memory = deque(maxlen=self.parameters["max_memory"])
        self.model = model
        self.trainer = QTrainer(self.model, lr=self.parameters["lr"],
                                gamma=self.gamma)

    def get_state(self, game):
        """Return the 11-element state vector for ``game`` as 0/1 integers.

        Layout: danger straight/right/left (relative to the current
        heading), four heading flags and four food-position flags.
        """
        head = game.snake[0]
        # Cells adjacent to the head (20 is the grid step used here).
        point_l = Point(head.x - 20, head.y)
        point_r = Point(head.x + 20, head.y)
        point_u = Point(head.x, head.y - 20)
        point_d = Point(head.x, head.y + 20)

        dir_l = game.direction == Direction.LEFT
        dir_r = game.direction == Direction.RIGHT
        dir_u = game.direction == Direction.UP
        dir_d = game.direction == Direction.DOWN

        state = [
            # Danger is straight ahead
            (dir_r and game.is_collision(point_r)) or
            (dir_l and game.is_collision(point_l)) or
            (dir_u and game.is_collision(point_u)) or
            (dir_d and game.is_collision(point_d)),

            # Danger is to the right
            (dir_u and game.is_collision(point_r)) or
            (dir_d and game.is_collision(point_l)) or
            (dir_l and game.is_collision(point_u)) or
            (dir_r and game.is_collision(point_d)),

            # Danger is to the left
            (dir_d and game.is_collision(point_r)) or
            (dir_u and game.is_collision(point_l)) or
            (dir_r and game.is_collision(point_u)) or
            (dir_l and game.is_collision(point_d)),

            # Move direction
            dir_l,
            dir_r,
            dir_u,
            dir_d,

            # Food location
            game.food.x < game.head.x,  # food left
            game.food.x > game.head.x,  # food right
            game.food.y < game.head.y,  # food up
            game.food.y > game.head.y,  # food down
        ]
        return np.array(state, dtype=int)  # converting to 0 or 1

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay buffer."""
        self.memory.append((state, action, reward, next_state, done))

    def train_long_memory(self):
        """Train on up to ``batch_size`` transitions sampled from the buffer."""
        if not self.memory:
            # Nothing recorded yet; unpacking zip(*[]) below would raise.
            return
        batch_size = self.parameters["batch_size"]
        if len(self.memory) > batch_size:
            mini_sample = random.sample(self.memory, batch_size)  # list of tuples
        else:
            mini_sample = self.memory
        states, actions, rewards, next_states, dones = zip(*mini_sample)
        self.trainer.train_step(states, actions, rewards, next_states, dones)

    def train_short_memory(self, state, action, reward, next_state, done):
        """Train on a single transition via ``self.trainer.train_step``."""
        self.trainer.train_step(state, action, reward, next_state, done)

    def predict(self, state):
        """Return the index of the highest model output for ``state``."""
        state_tensor = torch.tensor(state, dtype=torch.float)
        # Pure inference: no autograd graph is needed here.
        with torch.no_grad():
            prediction = self.model(state_tensor)
        return torch.argmax(prediction).item()

    def get_action(self, state):
        """Return a one-hot list of length 3 selecting the next move.

        With ``args.use_trained`` set the model always decides. Otherwise a
        random move is taken when a draw from 0..200 falls below
        ``100 - n_games`` (exploration / exploitation trade-off).
        """
        final_move = [0, 0, 0]
        # Equality (not truthiness) is kept so non-bool values of
        # use_trained behave exactly as before.
        if self.args.use_trained == True:
            move = self.predict(state)
        else:
            self.epsilon = 100 - self.n_games
            if random.randint(0, 200) < self.epsilon:
                move = random.randint(0, 2)
            else:
                move = self.predict(state)
        final_move[move] = 1
        return final_move
class Agent:
    """Epsilon-greedy Q-learning agent for the snake game."""

    def __init__(self):
        """Initialize the agent's parameters."""
        self.n_games = 0
        self.epsilon = 0  # randomness; recomputed on every get_action call
        self.gamma = 0.9  # discount rate
        # Bounded replay buffer: the deque drops its oldest entry when full.
        self.memory = deque(maxlen=MAX_MEMORY)
        # NOTE(review): the middle argument is 258 - confirm this is
        # intended before changing it; saved weights depend on it.
        self.model = Linear_QNet(11, 258, 3)
        self.trainer = QTrainer(self.model, lr=LR, gamma=self.gamma)

    def get_state(self, game):
        """Return the 11 integer values that represent the game state.

        Layout: danger in front/right/left of the head, four heading flags
        (left, right, up, down) and four food-position flags.

        Raises:
            ValueError: if ``game.direction`` is none of the four directions.
        """
        head = game.snake[0]

        # Headings in clockwise order, and the unit step each one takes.
        # DOWN increases y, UP decreases it.
        cw_dirs = [
            Direction.RIGHT == game.direction,
            Direction.DOWN == game.direction,
            Direction.LEFT == game.direction,
            Direction.UP == game.direction,
        ]
        # Integer offsets replace the former cos/sin lookup: 20*cos(pi/2)
        # is ~1e-15 rather than 0, so neighbour points at coordinate 0 were
        # not exactly equal to grid cells.
        cw_steps = [(1, 0), (0, 1), (-1, 0), (0, -1)]
        heading = cw_dirs.index(True)

        def get_point(pos):
            """Neighbour of the head: 0 in front, 1 on the right, -1 on the left."""
            step_x, step_y = cw_steps[(heading + pos) % 4]
            return Point(head.x + 20 * step_x, head.y + 20 * step_y)

        state = [
            # Danger
            game.is_collision(get_point(0)),
            game.is_collision(get_point(1)),
            game.is_collision(get_point(-1)),

            # Move direction (left, right, up, down)
            cw_dirs[2],
            cw_dirs[0],
            cw_dirs[3],
            cw_dirs[1],

            # Food location
            game.food.x < head.x,
            game.food.x > head.x,
            game.food.y < head.y,
            game.food.y > head.y,
        ]
        return np.array(state, dtype=int)

    def remember(self, state, action, reward, next_state, done):
        """Store one frame iteration (one play step) in memory."""
        self.memory.append((state, action, reward, next_state, done))

    def train_long_memory(self):
        """Train on up to BATCH_SIZE transitions sampled from the buffer."""
        if not self.memory:
            # Nothing recorded yet; unpacking zip(*[]) below would raise.
            return
        if len(self.memory) > BATCH_SIZE:
            mini_sample = random.sample(self.memory, BATCH_SIZE)  # list of tuples
        else:
            mini_sample = self.memory
        states, actions, rewards, next_states, dones = zip(*mini_sample)
        self.trainer.train_step(states, actions, rewards, next_states, dones)

    def train_short_memory(self, state, action, reward, next_state, done):
        """Train the model on the information of one frame iteration."""
        self.trainer.train_step(state, action, reward, next_state, done)

    def get_action(self, state):
        """Return a one-hot list of length 3 selecting the next move.

        Random moves trade exploration against exploitation: the larger
        epsilon (``80 - n_games``) is, the more likely a draw from 0..200
        falls below it and a random move is taken.
        """
        self.epsilon = 80 - self.n_games
        final_move = [0, 0, 0]
        if random.randint(0, 200) < self.epsilon:
            move = random.randint(0, 2)
        else:
            state0 = torch.tensor(state, dtype=torch.float)
            # Pure inference: no autograd graph is needed here.
            with torch.no_grad():
                prediction = self.model(state0)
            move = torch.argmax(prediction).item()
        final_move[move] = 1
        return final_move
class Agent:
    """Epsilon-greedy Q-learning agent for the snake environment."""

    def __init__(self):
        self.num_games = 0
        self.epsilon = 0  # controls the randomness; recomputed in get_action
        self.gamma = 0.9  # discount rate
        # Bounded replay buffer: the deque drops its oldest entry when full.
        self.memory = deque(maxlen=MAX_MEMORY)
        # NOTE(review): arguments are presumably input/hidden/output sizes -
        # confirm against Linear_QNet.
        self.model = Linear_QNet(11, 256, 3)
        self.trainer = QTrainer(self.model, lr=LR, gamma=self.gamma)

    def get_state(self, env):
        """Return the 11-element integer state vector for ``env``.

        Layout: danger straight/right/left (relative to the current
        heading), four heading flags and four food-position flags.
        """
        head = env.snake[0]
        # Cells adjacent to the head; "up" is the smaller y coordinate.
        point_l = Point(head.x - 20, head.y)
        point_r = Point(head.x + 20, head.y)
        point_u = Point(head.x, head.y - 20)
        point_d = Point(head.x, head.y + 20)

        dir_l = env.snake_direction == Direction.LEFT
        dir_r = env.snake_direction == Direction.RIGHT
        dir_u = env.snake_direction == Direction.UP
        dir_d = env.snake_direction == Direction.DOWN

        state = [
            # Danger straight
            (dir_r and env.is_collision(point_r)) or
            (dir_l and env.is_collision(point_l)) or
            (dir_u and env.is_collision(point_u)) or
            (dir_d and env.is_collision(point_d)),

            # Danger right
            (dir_u and env.is_collision(point_r)) or
            (dir_d and env.is_collision(point_l)) or
            (dir_l and env.is_collision(point_u)) or
            (dir_r and env.is_collision(point_d)),

            # Danger left
            (dir_d and env.is_collision(point_r)) or
            (dir_u and env.is_collision(point_l)) or
            (dir_r and env.is_collision(point_u)) or
            (dir_l and env.is_collision(point_d)),

            # Move direction
            dir_l,
            dir_r,
            dir_u,
            dir_d,

            # Food location (same convention as point_u / point_d above)
            env.food.x < env.head_position.x,  # food left
            env.food.x > env.head_position.x,  # food right
            env.food.y < env.head_position.y,  # food up
            env.food.y > env.head_position.y,  # food down
        ]
        return np.array(state, dtype=int)

    def store_data(self, state, action, reward, next_state, done):
        """Append one transition to the replay buffer."""
        self.memory.append((state, action, reward, next_state, done))

    def train_long_memory(self):
        """Train on up to BATCH_SIZE transitions sampled from the buffer."""
        if not self.memory:
            # Nothing recorded yet; unpacking zip(*[]) below would raise.
            return
        if len(self.memory) > BATCH_SIZE:
            batch_sample = random.sample(self.memory, BATCH_SIZE)
        else:
            batch_sample = self.memory
        states, actions, rewards, next_states, dones = zip(*batch_sample)
        self.trainer.train_step(states, actions, rewards, next_states, dones)

    def train_short_memory(self, state, action, reward, next_state, done):
        """Train on a single transition via ``self.trainer.train_step``."""
        self.trainer.train_step(state, action, reward, next_state, done)

    def get_action(self, state):
        """Return a one-hot list of length 3 selecting the next move.

        Random moves trade exploration against exploitation: a random move
        is taken when a draw from 0..200 falls below ``80 - num_games``.
        """
        self.epsilon = 80 - self.num_games
        move = [0, 0, 0]
        if random.randint(0, 200) < self.epsilon:
            move_idx = random.randint(0, 2)
        else:
            state0 = torch.tensor(state, dtype=torch.float)
            # Pure inference: no autograd graph is needed here.
            with torch.no_grad():
                prediction = self.model(state0)
            move_idx = torch.argmax(prediction).item()
        move[move_idx] = 1
        return move
class Agent:
    """Epsilon-greedy Q-learning agent for the snake game.

    Args:
        use_checkpoint: when true, weights are loaded from
            ``./model/model.pth`` and the model is put in eval mode.
    """

    def __init__(self, use_checkpoint=False):
        self.no_of_games = 0
        self.epsilon = 0  # randomness; recomputed on every get_action call
        self.gamma = 0.9  # discount rate
        # Bounded replay buffer: the deque drops its oldest entry when full.
        self.memory = deque(maxlen=MAX_MEMORY)
        # NOTE(review): arguments are presumably input/hidden/output sizes -
        # confirm against Linear_QNet.
        self.model = Linear_QNet(11, 256, 3)
        self.trainer = QTrainer(self.model, lr=LR, gamma=self.gamma)

        if use_checkpoint:
            checkpoint = torch.load("./model/model.pth")
            self.model.load_state_dict(checkpoint)
            self.model.eval()

    def get_state(self, game):
        """Return the 11-element integer state vector for ``game``.

        Layout: danger straight/right/left (relative to the current
        heading), four heading flags and four food-position flags.
        """
        head = game.snake[0]
        # Cells adjacent to the head.
        point_l = Point(head.x - BLOCK_SIZE, head.y)
        point_r = Point(head.x + BLOCK_SIZE, head.y)
        point_u = Point(head.x, head.y - BLOCK_SIZE)
        point_d = Point(head.x, head.y + BLOCK_SIZE)

        dir_l = game.direction == Direction.LEFT
        dir_r = game.direction == Direction.RIGHT
        dir_u = game.direction == Direction.UP
        dir_d = game.direction == Direction.DOWN

        state = [
            # Danger straight
            (dir_r and game.is_collision(point_r)) or
            (dir_l and game.is_collision(point_l)) or
            (dir_u and game.is_collision(point_u)) or
            (dir_d and game.is_collision(point_d)),

            # Danger right
            (dir_u and game.is_collision(point_r)) or
            (dir_d and game.is_collision(point_l)) or
            (dir_l and game.is_collision(point_u)) or
            (dir_r and game.is_collision(point_d)),

            # Danger left
            (dir_d and game.is_collision(point_r)) or
            (dir_u and game.is_collision(point_l)) or
            (dir_r and game.is_collision(point_u)) or
            (dir_l and game.is_collision(point_d)),

            # Move direction
            dir_l,
            dir_r,
            dir_u,
            dir_d,

            # Food location
            game.food.x < game.head.x,  # food left
            game.food.x > game.head.x,  # food right
            game.food.y < game.head.y,  # food up
            game.food.y > game.head.y,  # food down
        ]
        return np.array(state, dtype=int)

    def remember(self, state, action, reward, next_state, game_over):
        """Append one transition to the replay buffer."""
        self.memory.append((state, action, reward, next_state, game_over))

    def train_long_memory(self):
        """Train on up to BATCH_SIZE transitions sampled from the buffer."""
        if not self.memory:
            # Nothing recorded yet; unpacking zip(*[]) below would raise.
            return
        if len(self.memory) > BATCH_SIZE:
            mini_sample = random.sample(self.memory, BATCH_SIZE)
        else:
            mini_sample = self.memory
        states, actions, rewards, next_states, game_overs = zip(*mini_sample)
        self.trainer.train_step(states, actions, rewards, next_states, game_overs)

    def train_short_memory(self, state, action, reward, next_state, game_over):
        """Train on a single transition via ``self.trainer.train_step``."""
        self.trainer.train_step(state, action, reward, next_state, game_over)

    def get_action(self, state):
        """Return a one-hot list of length 3 selecting the next move.

        A random move is taken when a draw from 0..200 falls below
        ``80 - no_of_games``; otherwise the model decides.
        """
        self.epsilon = 80 - self.no_of_games
        action = [0, 0, 0]
        if random.randint(0, 200) < self.epsilon:
            move = random.randint(0, 2)
        else:
            state0 = torch.tensor(state, dtype=torch.float)
            # Pure inference: no autograd graph is needed here.
            with torch.no_grad():
                prediction = self.model(state0)
            move = torch.argmax(prediction).item()
        action[move] = 1
        return action
class Agent:
    """Epsilon-greedy Q-learning agent for the snake game."""

    def __init__(self):
        self.n_games = 0
        self.epsilon = 0  # randomness; recomputed on every get_action call
        self.gamma = 0.9  # discount rate
        # Bounded replay buffer: the deque drops its oldest entry when full.
        self.memory = deque(maxlen=MAX_MEMORY)
        self.model = Linear_QNet(11, 256, 3)  # input, hidden layer, output
        self.trainer = QTrainer(self.model, lr=LR, gamma=self.gamma)

    def get_state(self, game):
        """Return the 11-element integer state vector for ``game``.

        Layout: danger ahead/right/left (relative to the current heading),
        four heading flags and four food-position flags.
        """
        head = game.snake[0]
        # Cells adjacent to the head (20 is the grid step used here).
        point_l = Point(head.x - 20, head.y)
        point_r = Point(head.x + 20, head.y)
        point_u = Point(head.x, head.y - 20)
        point_d = Point(head.x, head.y + 20)

        dir_l = game.direction == Direction.LEFT
        dir_r = game.direction == Direction.RIGHT
        dir_u = game.direction == Direction.UP
        dir_d = game.direction == Direction.DOWN

        state = [
            # Danger ahead
            (dir_r and game.is_collision(point_r)) or
            (dir_l and game.is_collision(point_l)) or
            (dir_u and game.is_collision(point_u)) or
            (dir_d and game.is_collision(point_d)),

            # Danger to the right
            (dir_u and game.is_collision(point_r)) or
            (dir_d and game.is_collision(point_l)) or
            (dir_l and game.is_collision(point_u)) or
            (dir_r and game.is_collision(point_d)),

            # Danger to the left
            (dir_d and game.is_collision(point_r)) or
            (dir_u and game.is_collision(point_l)) or
            (dir_r and game.is_collision(point_u)) or
            (dir_l and game.is_collision(point_d)),

            # Direction of movement
            dir_l,
            dir_r,
            dir_u,
            dir_d,

            # Food location
            game.food.x < game.head.x,  # food left
            game.food.x > game.head.x,  # food right
            game.food.y < game.head.y,  # food up
            game.food.y > game.head.y,  # food down
        ]
        return np.array(state, dtype=int)

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay buffer."""
        self.memory.append((state, action, reward, next_state, done))

    def train_long_memory(self):
        """Train on up to BATCH_SIZE transitions sampled from the buffer."""
        if not self.memory:
            # Nothing recorded yet; unpacking zip(*[]) below would raise.
            return
        if len(self.memory) > BATCH_SIZE:
            mini_sample = random.sample(self.memory, BATCH_SIZE)  # list of tuples
        else:
            mini_sample = self.memory
        states, actions, rewards, next_states, dones = zip(*mini_sample)
        self.trainer.train_step(states, actions, rewards, next_states, dones)

    def train_short_memory(self, state, action, reward, next_state, done):
        """Train on a single transition via ``self.trainer.train_step``."""
        self.trainer.train_step(state, action, reward, next_state, done)

    def get_action(self, state):
        """Return a one-hot list of length 3 selecting the next move.

        Random moves trade exploration against exploitation: a random move
        is taken when a draw from 0..200 falls below ``80 - n_games``.
        """
        self.epsilon = 80 - self.n_games
        final_move = [0, 0, 0]
        if random.randint(0, 200) < self.epsilon:
            move = random.randint(0, 2)
        else:
            state0 = torch.tensor(state, dtype=torch.float)
            # Pure inference: no autograd graph is needed here.
            with torch.no_grad():
                prediction = self.model(state0)
            move = torch.argmax(prediction).item()
        final_move[move] = 1
        return final_move
class Agent:
    """Configurable deep Q-learning agent for the snake game.

    All hyper-parameters are read from ``agent_cfg`` (epsilon, random_until,
    max_memory_size, model, lr, gamma, batch.method, batch.size,
    state.lookback).
    """

    def __init__(self, agent_cfg) -> None:
        self.n_games = 0
        self.agent_cfg = agent_cfg
        self.epsilon = agent_cfg.epsilon  # randomness
        self.random_until = agent_cfg.random_until
        self.memory = deque(maxlen=agent_cfg.max_memory_size)
        self.model = LinearQNet(agent_cfg.model)
        self.trainer = QTrainer(self.model, agent_cfg.lr, agent_cfg.gamma)

    def get_state(self, game):
        """Encode the game situation plus recent actions as a float array.

        Layout: danger straight / right / left (3), move direction (4), food
        direction flags (4), normalized food offset (2), followed by the
        flattened last ``agent_cfg.state.lookback`` actions.

        The array is float so that the fractional food offsets survive; an
        integer dtype would truncate them to 0.
        """
        head = game.snake[0]
        last_actions = self.get_previous_actions(self.agent_cfg.state.lookback)

        point_l = Point(head.x - game.block_size, head.y)
        point_r = Point(head.x + game.block_size, head.y)
        point_u = Point(head.x, head.y - game.block_size)
        point_d = Point(head.x, head.y + game.block_size)

        dir_l = game.direction == Direction.LEFT
        dir_r = game.direction == Direction.RIGHT
        dir_u = game.direction == Direction.UP
        dir_d = game.direction == Direction.DOWN

        state = [
            # Danger straight
            (dir_r and game.is_collision(point_r))
            or (dir_l and game.is_collision(point_l))
            or (dir_u and game.is_collision(point_u))
            or (dir_d and game.is_collision(point_d)),
            # Danger right
            (dir_u and game.is_collision(point_r))
            or (dir_d and game.is_collision(point_l))
            or (dir_l and game.is_collision(point_u))
            or (dir_r and game.is_collision(point_d)),
            # Danger left
            (dir_d and game.is_collision(point_r))
            or (dir_u and game.is_collision(point_l))
            or (dir_r and game.is_collision(point_u))
            or (dir_l and game.is_collision(point_d)),
            # Move direction
            dir_l,
            dir_r,
            dir_u,
            dir_d,
            # Food location
            game.food.x < game.head.x,  # food left
            game.food.x > game.head.x,  # food right
            game.food.y < game.head.y,  # food up
            game.food.y > game.head.y,  # food down
            # Offset to the food, normalized by the board width / height
            (game.food.x - game.head.x) / game.w,
            (game.food.y - game.head.y) / game.h,
        ]
        state += [direction for action in last_actions for direction in action]
        assert len(state) == self.model.input_size
        return np.array(state, dtype=float)

    def get_previous_actions(self, n: int) -> List[List[int]]:
        """Return the last ``n`` remembered actions, most recent first.

        Slots for which no transition has been remembered yet hold an
        all-zero action of length ``model.output_size``; each such slot is a
        separate list, so mutating one does not affect the others.
        """
        width = self.model.output_size
        actions = [[0] * width for _ in range(n)]
        for i in range(min(n, len(self.memory))):
            # Transitions are (state, action, reward, next_state, gameover).
            actions[i] = self.memory[-(i + 1)][1]
        return actions

    def remember(self, state, action, reward, next_state, gameover):
        """Append one transition to the replay memory."""
        self.memory.append((state, action, reward, next_state, gameover))

    def sample_from_memory(self, n_samples: int):
        """Draw ``n_samples`` transitions according to ``agent_cfg.batch.method``.

        ``'linear'`` draws indices (with replacement) from a triangular
        distribution whose mode is the newest end of the memory, so recent
        transitions are favoured. ``'uniform'`` draws without replacement.

        Raises:
            ValueError: if the configured method is neither of the above.
        """
        size = len(self.memory)
        method = self.agent_cfg.batch.method
        if method == 'linear':
            samples = np.random.triangular(0, size, size, n_samples)
            # A draw equal to the right edge would index one past the end.
            idxs = np.minimum(samples.astype(int), size - 1)
            return [self.memory[idx] for idx in idxs]
        if method == 'uniform':
            # random.choice() takes a single sequence argument; sampling
            # several items needs random.sample().
            return random.sample(self.memory, n_samples)
        raise ValueError('Invalid choice for `agent.batch.method`')

    def train_long_memory(self):
        """Train on a batch of up to ``agent_cfg.batch.size`` transitions.

        Does nothing when the memory is empty; unpacking an empty batch would
        otherwise raise ``ValueError``.
        """
        if not self.memory:
            return
        if len(self.memory) > self.agent_cfg.batch.size:
            mini_batch = self.sample_from_memory(self.agent_cfg.batch.size)  # list of tuples
        else:
            mini_batch = self.memory
        states, actions, rewards, next_states, gameovers = zip(*mini_batch)
        self.trainer.train_step(states, actions, rewards, next_states, gameovers)

    def train_short_memory(self, state, action, reward, next_state, gameover):
        """Run a single training step on one transition."""
        self.trainer.train_step(state, action, reward, next_state, gameover)

    def get_action(self, state):
        """Return a one-hot action, chosen randomly or by the model.

        The probability of a random move starts at ``epsilon`` and decreases
        linearly with ``n_games``, reaching zero at ``random_until`` games.
        """
        # random moves: exploration / exploitation tradeoff
        action = [0] * self.model.output_size
        rand_action_thresh = self.epsilon - (self.n_games * self.epsilon / self.random_until)
        if random.random() < rand_action_thresh:
            move = random.randint(0, len(action) - 1)
        else:
            # Pure inference: no autograd graph is needed to pick the argmax.
            with torch.no_grad():
                prediction = self.model(torch.tensor(state, dtype=torch.float))
            move = int(prediction.argmax().item())
        action[move] = 1
        return action
class Agent:
    """
    Agent class: picks the snake's moves and trains the Q-network
    """

    def __init__(self, game, pars=None):
        """
        (Agent, Snake, dict or None) -> None
        Initialize the agent. Entries of pars (e.g. loaded from a json
        file) override the module defaults; recognised keys are 'eps',
        'gamma', 'eps_range', 'hidden_size' and 'lr'.
        pars defaults to None instead of a shared mutable dict.
        """
        if pars is None:
            pars = {}
        self.n_games = 0
        self.epsilon = pars.get('eps', EPSILON)
        self.eps = pars.get('eps', EPSILON)
        self.gamma = pars.get('gamma', GAMMA)  # discount rate
        self.eps_range = pars.get('eps_range', EPS_RANGE)
        print(self.epsilon, self.eps)
        # bounded deque: the oldest transitions are dropped once it is full
        self.memory = deque(maxlen=MAX_MEMORY)
        # the network input size is taken from the length of the game state
        self.model = Linear_QNet(len(game.get_state()),
                                 pars.get('hidden_size', HIDDEN_SIZE),
                                 OUTPUT_SIZE)
        self.trainer = QTrainer(self.model, lr=pars.get('lr', LR),
                                gamma=self.gamma)
        self.game = game

    def remember(self, *args):
        """
        (Agent, (state, action, reward, next_state, done)) -> None
        state: current state
        action: current action
        reward: current immediate reward
        next_state: the state that follows
        done: terminal state flag
        Append the transition to the memory queue.
        Raises ValueError unless exactly five values are passed.
        """
        state, action, reward, next_state, done = args
        self.memory.append((state, action, reward, next_state, done))

    def train_long_memory(self):
        """
        (Agent) -> None
        Train on a batch of up to BATCH_SIZE remembered transitions.
        Does nothing when the memory is empty, since unpacking an empty
        batch would otherwise raise ValueError.
        """
        if not self.memory:
            return
        # when the memory holds more than BATCH_SIZE transitions,
        # randomly sample BATCH_SIZE of them; otherwise use them all
        if len(self.memory) > BATCH_SIZE:
            mini_sample = random.sample(self.memory, BATCH_SIZE)  # list of tuples
        else:
            mini_sample = self.memory
        # regroup the transitions column-wise and train with QTrainer
        states, actions, rewards, next_states, dones = zip(*mini_sample)
        self.trainer.train_step(states, actions, rewards, next_states, dones)

    def train_short_memory(self, *args):
        """
        (Agent, (state, action, reward, next_state, done)) -> None
        Run a single training step on one transition.
        Raises ValueError unless exactly five values are passed.
        """
        state, action, reward, next_state, done = args
        self.trainer.train_step(state, action, reward, next_state, done)

    def get_action(self, state):
        """
        (Agent, state) -> list of 3 ints (one-hot)
        Get an action either randomly or from the model.
        """
        # tradeoff exploration / exploitation based on epsilon and eps_range
        self.epsilon = self.eps - self.n_games
        final_move = [0, 0, 0]
        # is_random_move decides whether this move is an exploration move
        if is_random_move(self.epsilon, self.eps_range):
            # pick one of the three bits at random
            move = random.randint(0, 2)
        else:
            # otherwise take the argmax of the network output;
            # this is pure inference, so no autograd graph is needed
            state0 = torch.tensor(state, dtype=torch.float)
            with torch.no_grad():
                prediction = self.model(state0)
            move = torch.argmax(prediction).item()
        final_move[move] = 1
        return final_move
class Agent:
    """Q-learning agent for the snake game with body-proximity features.

    Each frame is encoded as ``n_state`` (14) values; the last
    ``frame_to_read`` frames are stacked into the array handed to the model.
    """

    def __init__(self):
        self.n_games = 0
        self.n_state = 14
        self.frame_to_read = 1
        self.epsilon = 0.4
        self.gamma = 0.8
        self.memory = deque(maxlen=MAX_MEM)
        # Rolling window of the most recent frames, pre-filled with zeros.
        self.states = deque(maxlen=self.frame_to_read)
        for _ in range(self.frame_to_read):
            self.states.append([0 for _ in range(self.n_state)])
        self.trainer = QTrainer(self.n_state * self.frame_to_read, LR,
                                self.n_state * self.frame_to_read,
                                [256, 256], 3, self.gamma)

    @staticmethod
    def _body_proximity(game, forward, right):
        """Return (straight, right, left) closeness to the snake's own body.

        ``forward`` and ``right`` are unit (dx, dy) vectors for the current
        heading, or ``None`` when no heading is set. For every body segment
        (the first entry of ``game.snake`` is skipped) that lies exactly on
        the axis ahead, to the right or to the left of ``game.head``, the
        smallest distance ``d`` is kept and reported as ``1 / (d + 1)``;
        a side with no segment yields 0.0.
        """
        nearest_s = nearest_r = nearest_l = float('inf')
        if forward is not None:
            fx, fy = forward
            rx, ry = right
            for i, seg in enumerate(game.snake):
                if i == 0:
                    continue
                dx = seg.x - game.head.x
                dy = seg.y - game.head.y
                along = fx * dx + fy * dy  # offset in the heading direction
                side = rx * dx + ry * dy  # offset towards the right-hand side
                if side == 0 and along >= 0:
                    nearest_s = min(nearest_s, along)
                if along == 0 and side >= 0:
                    nearest_r = min(nearest_r, side)
                if along == 0 and side <= 0:
                    nearest_l = min(nearest_l, -side)
        return (1 / (nearest_s + 1),
                1 / (nearest_r + 1),
                1 / (nearest_l + 1))

    def get_state(self, game):
        """Encode the current frame and return the stacked frame window.

        Frame layout (14 values): danger straight / right / left (3), body
        closeness straight / right / left (3), move direction left / right /
        up / down (4), food left / right / up / down (4).

        Returns a float32 array of shape ``(-1, n_state * frame_to_read)``.
        """
        head = game.snake[0]
        point_l = Point(head.x - BLOCK_SIZE, head.y)
        point_r = Point(head.x + BLOCK_SIZE, head.y)
        point_u = Point(head.x, head.y - BLOCK_SIZE)
        point_d = Point(head.x, head.y + BLOCK_SIZE)

        dir_l = game.direction == Direction.LEFT
        dir_r = game.direction == Direction.RIGHT
        dir_u = game.direction == Direction.UP
        dir_d = game.direction == Direction.DOWN

        # Heading as (forward, right) unit vectors; y grows downwards, which
        # is why "right of LEFT" points towards smaller y.
        if dir_l:
            forward, right = (-1, 0), (0, -1)
        elif dir_r:
            forward, right = (1, 0), (0, 1)
        elif dir_u:
            forward, right = (0, -1), (1, 0)
        elif dir_d:
            forward, right = (0, 1), (-1, 0)
        else:
            forward = right = None
        self_s, self_r, self_l = self._body_proximity(game, forward, right)

        state = [
            # danger straight
            (dir_r and game.is_collision(point_r))
            or (dir_l and game.is_collision(point_l))
            or (dir_u and game.is_collision(point_u))
            or (dir_d and game.is_collision(point_d)),
            # danger right
            (dir_u and game.is_collision(point_r))
            or (dir_d and game.is_collision(point_l))
            or (dir_l and game.is_collision(point_u))
            or (dir_r and game.is_collision(point_d)),
            # danger left
            (dir_d and game.is_collision(point_r))
            or (dir_u and game.is_collision(point_l))
            or (dir_r and game.is_collision(point_u))
            or (dir_l and game.is_collision(point_d)),
            # to body coefficient
            self_s,
            self_r,
            self_l,
            # move direction
            dir_l,
            dir_r,
            dir_u,
            dir_d,
            # food loc
            game.food.x < game.head.x,
            game.food.x > game.head.x,
            game.food.y < game.head.y,
            game.food.y > game.head.y,
        ]
        print(state)
        self.states.append(state)
        return np.array(list(self.states), dtype=np.float32).reshape(
            (-1, self.n_state * self.frame_to_read))

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay memory."""
        self.memory.append((state, action, reward, next_state, done))

    def train_long_memory(self):
        """Train on a batch of up to ``BATCH_SIZE`` remembered transitions.

        Does nothing when the memory is empty; unpacking an empty batch would
        otherwise raise ``ValueError``.
        """
        if not self.memory:
            return
        if len(self.memory) > BATCH_SIZE:
            minibatch = random.sample(self.memory, BATCH_SIZE)
        else:
            minibatch = self.memory
        states, actions, rewards, next_states, dones = zip(*minibatch)
        self.trainer.train_step(states, actions, rewards, next_states, dones)

    def train_short_memory(self, state, action, reward, next_state, done):
        """Run one training step on a single transition (as a batch of one)."""
        print('reward =', reward)
        self.trainer.train_step((state, ), (action, ), (reward, ),
                                (next_state, ), (done, ))

    def get_action(self, state):
        """Return a one-hot move of length 3.

        With probability ``epsilon`` the move is random; otherwise it is the
        argmax of ``trainer.model.predict(state)``.
        """
        final_move = [0, 0, 0]
        if random.random() < self.epsilon:
            print('random behavior')
            move = random.randint(0, 2)
        else:
            prediction = self.trainer.model.predict(state)
            move = np.argmax(prediction)
        final_move[move] = 1
        return final_move
class Agent:
    """Deep Q-learning agent for the snake game.

    Holds the Q-network, its trainer and a bounded replay memory.
    """

    def __init__(self):
        self.n_games = 0
        self.epsilon = 0  # randomness
        self.gamma = 0.9  # discount rate
        self.model = Linear_Qnet(11, 256, 3)
        self.trainer = QTrainer(self.model, lr=LR, gamma=self.gamma)
        # Bounded deque: the oldest transitions are dropped once it is full.
        self.memory = deque(maxlen=MAX_MEMEORY)

    def get_state(self, game):
        """Encode the current game situation as an 11-element int array.

        Layout: danger straight / right / left (3), current move direction
        left / right / up / down (4), food left / right / up / down (4).

        NOTE(review): the collision check is called as ``game.is_collison``
        (sic); confirm that spelling matches the game class.
        """
        head = game.snake[0]
        # Neighbouring cells of the head; 20 is presumably the game's block
        # size in pixels -- confirm against the game module.
        point_l = Point(head.x - 20, head.y)
        point_r = Point(head.x + 20, head.y)
        point_u = Point(head.x, head.y - 20)
        point_d = Point(head.x, head.y + 20)

        dir_l = game.direction == Direction.LEFT
        dir_r = game.direction == Direction.RIGHT
        dir_u = game.direction == Direction.UP
        dir_d = game.direction == Direction.DOWN

        state = [
            # danger straight
            (dir_r and game.is_collison(point_r))
            or (dir_l and game.is_collison(point_l))
            or (dir_u and game.is_collison(point_u))
            or (dir_d and game.is_collison(point_d)),
            # danger right
            (dir_u and game.is_collison(point_r))
            or (dir_d and game.is_collison(point_l))
            or (dir_l and game.is_collison(point_u))
            or (dir_r and game.is_collison(point_d)),
            # danger left
            (dir_d and game.is_collison(point_r))
            or (dir_u and game.is_collison(point_l))
            or (dir_r and game.is_collison(point_u))
            or (dir_l and game.is_collison(point_d)),
            # move direction
            dir_l,
            dir_r,
            dir_u,
            dir_d,
            # food location
            game.food.x < game.head.x,  # food left
            game.food.x > game.head.x,  # food right
            game.food.y < game.head.y,  # food up
            game.food.y > game.head.y,  # food down
        ]
        return np.array(state, dtype=int)

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay memory."""
        self.memory.append((state, action, reward, next_state, done))

    def train_long_memory(self):
        """Train on a batch of up to ``BATCH_SIZE`` remembered transitions.

        Does nothing when the memory is empty; unpacking an empty batch would
        otherwise raise ``ValueError``.
        """
        if not self.memory:
            return
        if len(self.memory) > BATCH_SIZE:
            mini_sample = random.sample(self.memory, BATCH_SIZE)  # list of tuples
        else:
            mini_sample = self.memory
        states, actions, rewards, next_states, dones = zip(*mini_sample)
        self.trainer.train_step(states, actions, rewards, next_states, dones)

    def train_short_memory(self, state, action, reward, next_state, done):
        """Run a single training step on one transition."""
        self.trainer.train_step(state, action, reward, next_state, done)

    def get_action(self, state):
        """Return a one-hot move of length 3, chosen randomly or by the model.

        ``epsilon`` is ``80 - n_games``; a random move is taken when a draw
        from ``randint(0, 200)`` is below it.
        """
        # random moves: tradeoff exploration / exploitation
        self.epsilon = 80 - self.n_games
        final_move = [0, 0, 0]
        if random.randint(0, 200) < self.epsilon:
            move = random.randint(0, 2)
        else:
            state0 = torch.tensor(state, dtype=torch.float)
            # Pure inference: no autograd graph is needed to pick the argmax.
            with torch.no_grad():
                prediction = self.model(state0)
            move = torch.argmax(prediction).item()
        final_move[move] = 1
        return final_move
class Agent:
    """Deep Q-learning agent for the snake game.

    Holds a bounded replay memory, the ``Linear_QNet`` model and its
    ``QTrainer``.
    """

    def __init__(self):
        self.n_game = 0
        self.epsilon = 0  # randomness
        self.gamma = 0.9  # discount rate
        # Bounded deque: the oldest transitions are dropped once it is full.
        self.memory = deque(maxlen=MAX_MEMORY)
        self.model = Linear_QNet(11, 256, 3)
        self.trainer = QTrainer(self.model, lr=LR, gamma=self.gamma)

    def get_state(self, game):
        """Encode the current game situation as an 11-element int array.

        Layout:
            [danger straight, danger right, danger left,
             direction left, direction right, direction up, direction down,
             food left, food right, food up, food down]

        "Right" and "left" are relative to the current heading: e.g. when
        moving up the right-hand cell is ``point_r`` and the left-hand cell
        is ``point_l``; when moving left they are ``point_u`` and ``point_d``.
        """
        head = game.snake[0]
        point_l = Point(head.x - BLOCK_SIZE, head.y)
        point_r = Point(head.x + BLOCK_SIZE, head.y)
        point_u = Point(head.x, head.y - BLOCK_SIZE)
        point_d = Point(head.x, head.y + BLOCK_SIZE)

        dir_l = game.direction == Direction.LEFT
        dir_r = game.direction == Direction.RIGHT
        dir_u = game.direction == Direction.UP
        dir_d = game.direction == Direction.DOWN

        state = [
            # Danger straight: the cell in the heading direction
            (dir_u and game.is_collision(point_u))
            or (dir_d and game.is_collision(point_d))
            or (dir_l and game.is_collision(point_l))
            or (dir_r and game.is_collision(point_r)),
            # Danger right: the cell a clockwise turn would enter
            (dir_u and game.is_collision(point_r))
            or (dir_d and game.is_collision(point_l))
            or (dir_l and game.is_collision(point_u))
            or (dir_r and game.is_collision(point_d)),
            # Danger left: the cell a counter-clockwise turn would enter
            (dir_d and game.is_collision(point_r))
            or (dir_u and game.is_collision(point_l))
            or (dir_r and game.is_collision(point_u))
            or (dir_l and game.is_collision(point_d)),
            # Move direction
            dir_l,
            dir_r,
            dir_u,
            dir_d,
            # Food location
            game.food.x < game.head.x,  # food is to the left
            game.food.x > game.head.x,  # food is to the right
            game.food.y < game.head.y,  # food is up
            game.food.y > game.head.y,  # food is down
        ]
        return np.array(state, dtype=int)

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay memory."""
        self.memory.append((state, action, reward, next_state, done))

    def train_long_memory(self):
        """Train on a batch of up to ``BATCH_SIZE`` remembered transitions.

        Does nothing when the memory is empty; unpacking an empty batch would
        otherwise raise ``ValueError``.
        """
        if not self.memory:
            return
        if len(self.memory) > BATCH_SIZE:
            mini_sample = random.sample(self.memory, BATCH_SIZE)
        else:
            mini_sample = self.memory
        states, actions, rewards, next_states, dones = zip(*mini_sample)
        self.trainer.train_step(states, actions, rewards, next_states, dones)

    def train_short_memory(self, state, action, reward, next_state, done):
        """Run a single training step on one transition."""
        self.trainer.train_step(state, action, reward, next_state, done)

    def get_action(self, state):
        """Return a one-hot move of length 3, chosen randomly or by the model.

        ``epsilon`` is ``80 - n_game``; a random move is taken when a draw
        from ``randint(0, 200)`` is below it.
        """
        # random moves: tradeoff exploration / exploitation
        self.epsilon = 80 - self.n_game
        final_move = [0, 0, 0]
        if random.randint(0, 200) < self.epsilon:
            move = random.randint(0, 2)
        else:
            state0 = torch.tensor(state, dtype=torch.float)
            # Put the input on the device the model actually lives on. An
            # unconditional .cuda() fails on CPU-only hosts and mismatches a
            # model that was never moved to the GPU.
            first_param = next(self.model.parameters(), None)
            if first_param is not None:
                state0 = state0.to(first_param.device)
            # Pure inference: no autograd graph is needed to pick the argmax.
            with torch.no_grad():
                prediction = self.model(state0)  # prediction by model
            move = torch.argmax(prediction).item()
        final_move[move] = 1
        return final_move