Example #1
def monte_carlo_demo():
    np.random.seed(101)
    env = SnakeEnv(10, [3, 6])
    agent = ModelFreeAgent(env)
    mc = MonteCarlo(0.5)
    with timer('Timer Monte Carlo Iter'):
        mc.monte_carlo_opt(agent, env)
    print('MonteCarlo:return_pi={}'.format(eval_game(env, agent)))
    print(agent.pi)

    np.random.seed(101)
    env = SnakeEnv(10, [3, 6])
    agent2 = TableAgent(env)
    pi_algo = PolicyIteration()
    with timer('Timer PolicyIter'):
        pi_algo.policy_iteration(agent2)
    print('PolicyIteration:return_pi={}'.format(eval_game(env, agent2)))
    print(agent2.pi)

    np.random.seed(101)
    env = SnakeEnv(10, [3, 6])
    agent3 = ModelFreeAgent(env)
    mc = QLearning(0.5)
    with timer('Timer QLearning Iter'):
        mc.q_learning(agent3, env)
    print('QLearning:return_pi={}'.format(eval_game(env, agent3)))
    print(agent3.pi)
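All of these demos wrap the optimization call in a `timer` context manager whose definition is not part of this listing. A minimal sketch of what such a helper could look like (hypothetical implementation, assuming it only reports the wall-clock time of the wrapped block):

import time
from contextlib import contextmanager

@contextmanager
def timer(name):
    # Hypothetical helper, not the project's actual implementation:
    # report how long the wrapped block took under the given label.
    start = time.time()
    try:
        yield
    finally:
        print('{}: {:.4f} s'.format(name, time.time() - start))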
Example #2
def policy_iteration_demo1():
    env = SnakeEnv(0, [3, 6])
    agent = TableAgent(env)
    pi_algo = PolicyIteration()
    pi_algo.policy_iteration(agent)
    print('return_pi={}'.format(eval_game(env, agent)))
    print(agent.pi)
Example #3
def policy_iteration_demo1():
    env = SnakeEnv(0, [3, 6])  # 0 means the board has no ladders
    agent = TableAgent(env)  # table-based agent
    pi_algo = PolicyIteration()  # policy iteration algorithm
    pi_algo.policy_iteration(agent)  # compute the updated state-value function and policy
    print('return_pi={}'.format(eval_game(env, agent)))
    print(agent.pi)
Example #4
def generalized_iteration_demo():
    np.random.seed(0)
    env = SnakeEnv(10, [3, 6])
    agent = TableAgent(env)
    pi_algo = GeneralizedPolicyIteration()
    with timer('Timer GeneralizedIter'):
        pi_algo.generalized_policy_iteration(agent)
    print('return_pi={}'.format(eval_game(env, agent)))
def monte_carlo_demo2():
    env = SnakeEnv(10, [3, 6])
    agent = ModelFreeAgent(env)
    mc = MonteCarlo(0.5)
    with timer('Timer Monte Carlo Iter'):
        mc.monte_carlo_opt(agent, env)
    print('return_pi={}'.format(eval_game(env, agent)))
    print(agent.pi)
Example #6
def value_iteration_demo():
    np.random.seed(0)
    env = SnakeEnv(10, [3, 6])
    agent = TableAgent(env)
    vi_algo = ValueIteration()
    vi_algo.value_iteration(agent)
    print('return_pi={}'.format(eval_game(env, agent)))
    print(agent.pi)
Example #7
def value_iteration_demo():
    np.random.seed(0)
    env = SnakeEnv(10, [3, 6])
    agent = TableAgent(env)
    pi_algo = ValueIteration()
    with timer('Timer ValueIter'):
        pi_algo.value_iteration(agent)
    print('return_pi={}'.format(eval_game(env, agent)))
Example #8
def policy_iteration_demo():
    np.random.seed(0)
    env = SnakeEnv(10, [3, 6])
    agent = TableAgent(env)
    pi_algo = PolicyIterationWithTimer()
    pi_algo.policy_iteration(agent)
    print('return_pi={}'.format(eval_game(env, agent)))
    print(agent.pi)
Example #9
def main():
    times = []
    env = SnakeEnv()
    for i in range(100):
        st = time.time()
        done = False
        env.reset()
        score = 0
        food = 0
        while not done:
            info = {"Food": (food, (10, 30))}
            state, reward, done = env.step(get_input(), info=info)
            score += reward
            if reward == settings.FOOD_REWARD:
                food += 1

            env.render(sleep=False)
            # print(reward)
            if done:
                et = time.time()
                times.append(et - st)
                # quit()
                break
    print(1 / (mean(times)), end=" games per second\n")
    print(1 / (max(times)), end=" slowest games per second\n")
    print(1 / (min(times)), end=" fastest games per second\n")
Example #10
def test_easy():
    np.random.seed(0)
    sum_opt = 0
    sum_0 = 0
    sum_1 = 0
    env = SnakeEnv(0, [3, 6])
    for i in range(10000):
        sum_opt += eval_game(env, policy_ref)
        sum_0 += eval_game(env, policy_0)
        sum_1 += eval_game(env, policy_1)
    print('opt avg={}'.format(sum_opt / 10000.0))
    print('0 avg={}'.format(sum_0 / 10000.0))
    print('1 avg={}'.format(sum_1 / 10000.0))
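`policy_ref`, `policy_0`, and `policy_1` are baseline policies whose definitions are not shown in this listing. A hedged sketch of what they might look like, assuming a 101-entry state space and mirroring the hand-set policies in policy_iteration_demo2 below (always action 0, always action 1, and action 1 everywhere except the last three states):

import numpy as np

# Hypothetical baselines; the real definitions are not included in the listing.
policy_0 = np.zeros(101, dtype=np.int64)   # always choose action 0
policy_1 = np.ones(101, dtype=np.int64)    # always choose action 1
policy_ref = np.ones(101, dtype=np.int64)  # action 1 everywhere...
policy_ref[97:100] = 0                     # ...except states 97-99 (cf. agent.pi[97:100] = 0 below)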
Example #11
def policy_iteration_demo2():
    env = SnakeEnv(10, [3, 6])
    agent = TableAgent(env)
    agent.pi[:] = 0
    print('return3={}'.format(eval_game(env, agent)))
    agent.pi[:] = 1
    print('return6={}'.format(eval_game(env, agent)))
    agent.pi[97:100] = 0
    print('return_ensemble={}'.format(eval_game(env, agent)))
    pi_algo = PolicyIteration()
    pi_algo.policy_iteration(agent)
    print('return_pi={}'.format(eval_game(env, agent)))
    print(agent.pi)
def first_easy():
    sum_opt = 0
    sum_0 = 0
    sum_1 = 0
    env = SnakeEnv(0, [3, 6])
    countNum = 10000
    for i in range(countNum):
        sum_opt += eval_game(env, policy_ref)
        sum_0 += eval_game(env, policy_0)
        sum_1 += eval_game(env, policy_1)
    print('policy_ref avg={}'.format(sum_opt / countNum))
    print('policy_0 avg={}'.format(sum_0 / countNum))
    print('policy_1 avg={}'.format(sum_1 / countNum))
def monte_carlo_demo():
    env = SnakeEnv(10, [3, 6])
    agent = ModelFreeAgent(env)
    mc = MonteCarlo()
    with timer('Timer Monte Carlo Iter'):
        mc.monte_carlo_opt(agent, env)
    print('return_pi={}'.format(eval_game(env, agent)))
    print(agent.pi)

    agent2 = TableAgent(env)
    pi_algo = PolicyIteration()
    with timer('Timer PolicyIter'):
        pi_algo.policy_iteration(agent2)
    print('return_pi={}'.format(eval_game(env, agent2)))
    print(agent2.pi)
Example #14
File: dqn.py  Project: dilithjay/SnakeRL
from snake import SnakeEnv
import tensorflow as tf
import numpy as np
from tensorflow import keras
from tensorflow.keras import layers
import time
import pickle
import json
import os
from tqdm import tqdm

with open("config.json", 'r') as conf:
    config = json.load(conf)
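# The config.json file itself is not shown; judging from the keys read in this
# file, it presumably looks something like this (all values illustrative only):
#   {"width": 10, "height": 10, "gamma": 0.99,
#    "epsilon": 1.0, "epsilon_min": 0.1, "epsilon_max": 1.0,
#    "batch_size": 32, "max_steps_per_episode": 10000,
#    "epsilon_random_frames": 50000}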
WIDTH, HEIGHT = config['width'], config['height']
snake = SnakeEnv(width=WIDTH, height=HEIGHT)
num_steps = 10**6
FPS = 60

# Configuration parameters for the whole setup
seed = 42
gamma = config['gamma']  # Discount factor for past rewards
epsilon = config['epsilon']  # Epsilon greedy parameter
epsilon_min = config['epsilon_min']  # Minimum epsilon greedy parameter
epsilon_max = config['epsilon_max']  # Maximum epsilon greedy parameter
epsilon_interval = (
    epsilon_max - epsilon_min
)  # Rate at which to reduce chance of random action being taken
batch_size = config['batch_size']  # Size of batch taken from replay buffer

max_steps_per_episode = config['max_steps_per_episode']
# Number of frames to take random action and observe output
epsilon_random_frames = config["epsilon_random_frames"]
# Number of frames for exploration
# Stats settings
GET_STATS = 10
MODEL_SAVE = True

# Render
ISRENDER = True

# For stats
ep_rewards = [-200]
scores_history = [-100] * 20000

if not os.path.isdir('models-final-check'):
    os.makedirs('models-final-check')

env = SnakeEnv()
agent = DeepQAgent(env)  # DeepQAgent (and EPISODES below) come from elsewhere in the project, not shown in this listing
for episode in tqdm(range(1, EPISODES + 1), ascii=True, unit="episode"):
    agent.tensorboard.step = episode
    # Initialise state and reward.
    episode_reward = 0
    step = 1
    current_state = env.reset()
    done = False

    # Exploration vs. exploitation (epsilon-greedy): either take a random action
    # or the greedy action from the Q-network, then pre-process it for further steps.
    while not done:
        if np.random.random() > epsilon:
            action = np.argmax(agent.get_qs(current_state))
        else:
            action = np.random.randint(0, env.ACTION_SPACE_SIZE)
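        # Hypothetical continuation (the original listing is cut off here): decay
        # epsilon toward epsilon_min using the interval defined above; the real
        # project's schedule may differ.
        epsilon -= epsilon_interval / num_steps
        epsilon = max(epsilon, epsilon_min)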
Example #16
def __init__(self, args):
    self.args = args
    self.env = SnakeEnv(args.snake_head_x, args.snake_head_y, args.food_x,
                        args.food_y)
    self.agent = Agent(self.env.get_actions(), args.Ne, args.C, args.gamma)
Example #17
class Application:
    def __init__(self, args):
        self.args = args
        self.env = SnakeEnv(args.snake_head_x, args.snake_head_y, args.food_x,
                            args.food_y)
        self.agent = Agent(self.env.get_actions(), args.Ne, args.C, args.gamma)

    def execute(self):
        if not self.args.human:
            if self.args.train_eps != 0:
                self.train()
            return self.test()
        self.show_games()

    def train(self):
        print("Train Phase:")
        self.agent.train()
        window = self.args.window
        self.points_results = []
        first_eat = True
        start = time.time()

        for game in range(1, self.args.train_eps + 1):
            state = self.env.get_state()
            dead = False
            action = self.agent.act(state, 0, dead)
            while not dead:
                state, points, dead = self.env.step(action)

                # For debugging convenience, you can check whether your Q-table matches ours
                # for the given parameter settings (see the Debug Convenience part on the homework 4 web page).
                if first_eat and points == 1:
                    self.agent.save_model(utils.CHECKPOINT)
                    first_eat = False

                action = self.agent.act(state, points, dead)

            points = self.env.get_points()
            self.points_results.append(points)
            if game % self.args.window == 0:
                print(
                    "Games:",
                    len(self.points_results) - window,
                    "-",
                    len(self.points_results),
                    "Points (Average:",
                    sum(self.points_results[-window:]) / window,
                    "Max:",
                    max(self.points_results[-window:]),
                    "Min:",
                    min(self.points_results[-window:]),
                    ")",
                )
            self.env.reset()
        print("Training takes", time.time() - start, "seconds")
        self.agent.save_model(self.args.model_name)

    def test(self):
        print("Test Phase:")
        self.agent.eval()
        self.agent.load_model(self.args.model_name)
        points_results = []
        start = time.time()

        for game in range(1, self.args.test_eps + 1):
            state = self.env.get_state()
            dead = False
            action = self.agent.act(state, 0, dead)
            while not dead:
                state, points, dead = self.env.step(action)
                action = self.agent.act(state, points, dead)
            points = self.env.get_points()
            points_results.append(points)
            self.env.reset()

        print("Testing takes", time.time() - start, "seconds")
        print("Number of Games:", len(points_results))
        print("Average Points:", sum(points_results) / len(points_results))
        print("Max Points:", max(points_results))
        print("Min Points:", min(points_results))
        return sum(points_results) / len(points_results)

    def show_games(self):
        print("Display Games")
        self.env.display()
        pygame.event.pump()
        self.agent.eval()
        points_results = []
        end = False
        for game in range(1, self.args.show_eps + 1):
            state = self.env.get_state()
            dead = False
            action = self.agent.act(state, 0, dead)
            count = 0
            while not dead:
                count += 1
                pygame.event.pump()
                keys = pygame.key.get_pressed()
                if keys[K_ESCAPE] or self.check_quit():
                    end = True
                    break
                state, points, dead = self.env.step(action)
                # Qlearning agent
                if not self.args.human:
                    action = self.agent.act(state, points, dead)
                # for human player
                else:
                    print((state[0] + 1) // 40, (state[1] + 1) // 40)
                    for event in pygame.event.get():
                        if event.type == pygame.KEYDOWN:
                            if event.key == pygame.K_UP:
                                action = 0
                            elif event.key == pygame.K_DOWN:
                                action = 1
                            elif event.key == pygame.K_LEFT:
                                action = 2
                            elif event.key == pygame.K_RIGHT:
                                action = 3
            if end:
                break
            self.env.reset()
            points_results.append(points)
            print("Game:",
                  str(game) + "/" + str(self.args.show_eps), "Points:", points)
        if len(points_results) == 0:
            return
        print("Average Points:", sum(points_results) / len(points_results))

    def check_quit(self):
        for event in pygame.event.get():
            if event.type == pygame.QUIT:
                return True
        return False
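The listing never shows how Application is constructed. A hypothetical driver, where the attribute names are exactly the ones the class reads off self.args and every value is purely illustrative:

from argparse import Namespace

# All values below are illustrative; the real project parses them from the
# command line with argparse.
args = Namespace(snake_head_x=200, snake_head_y=200, food_x=80, food_y=80,
                 Ne=40, C=40, gamma=0.7, human=False, train_eps=10000,
                 test_eps=1000, show_eps=10, window=1000,
                 model_name='q_agent.npy')
app = Application(args)
app.execute()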
Example #18
def __init__(self, args):
    x_train = np.load("q_agent.npy")
    print(x_train)
    self.args = args
    self.env = SnakeEnv(args.snake_head_x, args.snake_head_y, args.food_x, args.food_y)
    self.agent = Agent(self.env.get_actions(), args.Ne, args.C, args.gamma)
Example #19
                    type=int,
                    nargs='?',
                    default=8,
                    help='Zoom per dimension')
parser.add_argument('--fps',
                    type=int,
                    nargs='?',
                    default=10,
                    help='Frames per second')

tensorize = lambda t: torch.FloatTensor(t.transpose(
    (2, 0, 1)).copy()).unsqueeze(0)

if __name__ == "__main__":
    args = parser.parse_args()
    env = SnakeEnv(args.dim, zoom=args.zoom)
    pyglet.clock.set_fps_limit(args.fps)
    global a, policy
    if (args.filename is None):
        a = np.random.randint(4)
        from pyglet.window import key

        def key_press(k, mod):
            global restart
            global a
            if k == key.R: restart = True
            if k == key.UP: a = 0
            if k == key.DOWN: a = 1
            if k == key.LEFT: a = 2
            if k == key.RIGHT: a = 3