def step(snakes: list, food: Food, action, obs_type: str = "Grid"):
    """Advance the environment by one player action and return a gym-style tuple.

    Args:
        snakes: All snakes in play. The FIRST entry must be the player;
            any others are enemies.
        food: The current Food object on the grid.
        action: The player's move (expected to be 0, 1, 2 or 3).
        obs_type: Observation format — "Grid" for an OBS_GRID_SIZE local
            grid around the player's head, "Small" for the compact
            feature observation. Added as a parameter (with a
            backward-compatible default) because the training loop calls
            ``step(..., obs_type=obs_type)``; previously the name was
            only resolvable as a global and the keyword call would raise
            TypeError.

    Returns:
        (obs, reward, done, info): ``obs`` is the chosen observation, or
        the sentinel string "Wrong Input" if ``obs_type`` matched neither
        format; ``reward`` is the player's accumulated reward; ``done``
        is currently always False here (episode length is capped by the
        caller); ``info`` is an empty string.
    """
    # First snake must be the player.
    player = snakes[0]
    player.action([action], "AC")
    done = False
    obs = "Wrong Input"  # sentinel kept when obs_type is unrecognised

    # Reward bookkeeping: the player's reward increases every time food
    # is eaten.
    handleFoodEating(snakes, food)

    # Build the observation of the state AFTER the move: either a local
    # OBS_GRID_SIZE grid centred on the player's head, or the compact
    # "Small" feature observation. The two formats are mutually
    # exclusive, so an elif is equivalent to the original second `if`.
    if obs_type == "Grid":
        obs = getObsGrid(snakes, food, OBS_GRID_SIZE, fullGrid=False)
    elif obs_type == "Small":
        obs = getObsSmall(snakes, food)

    info = ""
    return obs, player.reward, done, info
return obs, player.reward, done, info score_history = [] score = 0 n_steps_history = [] for i in tqdm(range(num_episodes)): player = Snake(0) enemy = Snake(1) enemy.positions = [(random.randint(0, SIZE - 1), random.randint(0, SIZE - 1))] food = Food([player]) done = False score = 0 # returns a numpy array of the state we care about observation = getObsGrid(snakes=[player], food=food, size=OBS_GRID_SIZE, fullGrid=False) #observation = getObsSmall([player, enemy], food) n_steps = 0 while not done and n_steps < 100: n_steps += 1 # action needs to be either 0,1,2 or 3 action = agentAC.choose_action(observation) observation_, reward, done, info = step(snakes=[player, enemy], food=food, action=action, obs_type=obs_type) agentAC.learn(observation, reward, observation_, done) # For Actor-Critic #agent.store_rewards(reward) # For REINFORCE observation = observation_ score += reward score_history.append(score) n_steps_history.append(n_steps) if i % SHOW_EVERY == 0: #print(f"on #{i}, epsilon is {lr}")