def analyze_methods():
    i = 1
    n = 2  # number of grids
    LVL = "EASY"
    filename = "analyse_" + LVL
    while i <= n:
        G = Game.random_generation(10 * i, 10 * i, LVL)
        if not G.is_winnable():
            continue  # retry with a fresh random grid of the same size

        #### --------------------- ITV
        # t1 = time.time()
        # policy = G.mdp.run_value_iteration(0.01)
        # t2 = time.time() - t1
        # nb_mouvement = G.play_with_policy(policy, False)
        #
        # f = open(filename + "_ITV", "a")
        # f.write(str(nb_mouvement) + " " + str(t2) + " " + str(i * 10) + " " + str(i * 10) + "\n")
        # f.close()

        #### --------------------- LP
        # G = Game(".game")
        # t1 = time.time()
        # policy = G.mdp.run_linear_programming_resolution()
        # t2 = time.time() - t1
        # nb_mouvement = G.play_with_policy(policy, False)
        # f = open(filename + "_LP", "a")
        # f.write(str(nb_mouvement) + " " + str(t2) + " " + str(i * 10) + " " + str(i * 10) + "\n")
        # f.close()

        #### --------------------- QL
        QL = QLearning(".game")
        t1 = time.time()
        policy = QL.run_Q_learning()
        t2 = time.time() - t1
        G = Game(".game")
        nb_mouvement = G.play_with_policy(policy, False)
        f = open(filename + "_QL", "a")
        f.write(str(nb_mouvement) + " " + str(t2) + " " + str(i * 10) + " " + str(i * 10) + "\n")
        f.close()
        i += 1
def performAction(self, state):
    if self.lastNodeId:
        if self.isSuccess(self.nodes.getNode(self.lastStepId), state):
            self.nodes.getNode(self.lastNodeId).addSuccess()
            self.nodes.getNode(self.lastStepId).addSuccess()
    root = self.nodes.getNode(state)
    if root.isNew():
        root.children = self.setChildren(state, root)
        root.setScore(self.calculateScore(state))
    ql.run(root)
    nextState = self.getWinningPath(root.id)
    nextNode = self.nodes.getNode(nextState)
    self.lastStepId = nextNode.id
    self.updateProbabilities()
    self.nodes.saveNode(nextNode)
    self.nodes.saveNode(root)
def get_alg(name, args, env):
    if name == "oracleq":
        alg_dict = {'horizon': args.horizon,
                    'alpha': args.lr,
                    'conf': args.conf}
        alg = OracleQ.OracleQ(env.action_space.n, params=alg_dict)
    elif name == 'decoding':
        alg_dict = {'horizon': env.horizon,
                    'model_type': args.model_type,
                    'n': args.n,
                    'num_cluster': args.num_cluster}
        alg = Decoding.Decoding(env.observation_space.n, env.action_space.n, params=alg_dict)
    elif name == 'qlearning':
        assert args.tabular, "[EXPERIMENT] Must run QLearning in tabular mode"
        alg_dict = {'alpha': float(args.lr),
                    'epsfrac': float(args.epsfrac),
                    'num_episodes': int(args.episodes)}
        alg = QLearning.QLearning(env.action_space.n, params=alg_dict)
    elif name == 'linq':
        alg_dict = {'horizon': env.horizon,
                    'conf': args.conf}
        alg = LinQ.LinQ(env.observation_space.n, env.action_space.n, params=alg_dict)
    else:
        raise ValueError("[EXPERIMENT] Unknown algorithm: %s" % name)
    return alg
def training_one(discountRate=0.98, actionProbabilityBase=1.8, trainingIterations=20000,
                 mountainCarBinsPerDimension=20, render=False, randomActionRate=0.01,
                 learningRateScale=0.01, use_memory=False):
    qlearner = QLearning.QLearning(
        stateSpaceShape=Assignment7Support.MountainCarStateSpaceShape(mountainCarBinsPerDimension),
        numActions=env.action_space.n,
        discountRate=discountRate)

    for trialNumber in range(trainingIterations):
        observation = env.reset()
        reward = 0
        qlearner.clear_record()
        for i in range(200):
            currentState = Assignment7Support.MountainCarObservationToStateSpace(
                observation, mountainCarBinsPerDimension)
            action = qlearner.GetAction(currentState,
                                        learningMode=True,
                                        randomActionRate=randomActionRate,
                                        actionProbabilityBase=actionProbabilityBase)
            oldState = Assignment7Support.MountainCarObservationToStateSpace(
                observation, mountainCarBinsPerDimension)
            observation, reward, isDone, info = env.step(action)
            newState = Assignment7Support.MountainCarObservationToStateSpace(
                observation, mountainCarBinsPerDimension)

            # learning rate scale
            qlearner.ObserveAction(oldState, action, newState, reward,
                                   learningRateScale=learningRateScale)
            if use_memory:
                qlearner.record(oldState, action, newState, reward)
            if isDone:
                if use_memory:
                    qlearner.replay(learningRateScale)
                # if (trialNumber + 1) % 1000 == 0:
                #     print(trialNumber + 1, i + 1, np.min(qlearner.q_table), np.mean(qlearner.q_table))
                break

    n = 20
    totalRewards = []
    for runNumber in range(n):
        observation = env.reset()
        totalReward = 0
        reward = 0
        for i in range(200):
            if render:
                renderDone = env.render()
            currentState = Assignment7Support.MountainCarObservationToStateSpace(
                observation, mountainCarBinsPerDimension)
            observation, reward, isDone, info = env.step(
                qlearner.GetAction(currentState, learningMode=False))
            totalReward += reward
            if isDone:
                if render:
                    renderDone = env.render()
                # print(runNumber + 1, i + 1, totalReward)
                totalRewards.append(totalReward)
                break
    if render:
        env.close()
    average_score = sum(totalRewards) / float(len(totalRewards))
    print(f'[{datetime.datetime.now()}] The average score of this one attempt is {average_score}')
    return average_score
def main():
    mdp = MarkovDecisionProblem.MarkovDecisionProblem()
    vi = ValueIteration.ValueIteration(mdp)
    ql = QLearning.QLearning(mdp)
    ql.qlearning(iterations=15, exploration=0.2)
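# The snippets in this collection all delegate the actual update to some
# QLearning class whose internals are not shown here. For reference, a
# minimal self-contained sketch of the tabular rule they build on; every
# name below is illustrative, not the API of any module used above:
import numpy as np

def q_update(Q, s, a, r, s_next, alpha=0.1, gamma=0.99):
    """One Q-learning step: Q(s,a) += alpha * (r + gamma * max_a' Q(s',a') - Q(s,a))."""
    td_target = r + gamma * np.max(Q[s_next])
    Q[s, a] += alpha * (td_target - Q[s, a])

# usage: a 5-state, 2-action table updated after one observed transition
Q = np.zeros((5, 2))
q_update(Q, s=0, a=1, r=1.0, s_next=3)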
def test_cartpole():
    cartpole = QLearning.CartPole()
    policy_filename = "policy/policy_9600.npy"
    env = gym.make('CartPole-v0')
    env = env.unwrapped
    env = Monitor(env, "video", force=True)
    episodes = 1000
    step = 20000
    my_test(policy_filename, env, episodes, step, cartpole.discrete_util)
def test_mountaincar():
    mountaincar = QLearning.MountainCar()
    policy_filename = "policy/policy_5300.npy"
    env = gym.make('MountainCar-v0')
    env = env.unwrapped
    # env = Monitor(env, "video", force=True)
    episodes = 1000
    step = 2000
    my_test(policy_filename, env, episodes, step, mountaincar.discrete_util)
def test_acrobot():
    acrobot = QLearning.Acrobot()
    policy_filename = "policy/policy_9900.npy"
    env = gym.make('Acrobot-v1')
    env = env.unwrapped
    # env = Monitor(env, "video", force=True)
    episodes = 1000
    step = 2000
    my_test(policy_filename, env, episodes, step, acrobot.discrete_util)
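# The three tests above all call my_test, which is not defined in this
# collection. A plausible sketch, assuming the .npy file stores one greedy
# action per discretized state and discrete_util maps a raw observation to
# that state index:
import numpy as np

def my_test(policy_filename, env, episodes, step, discrete_util):
    policy = np.load(policy_filename)
    returns = []
    for _ in range(episodes):
        observation = env.reset()
        total = 0.0
        for _ in range(step):
            action = int(policy[discrete_util(observation)])
            observation, reward, done, info = env.step(action)
            total += reward
            if done:
                break
        returns.append(total)
    print('average return over %d episodes: %.2f' % (episodes, np.mean(returns)))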
def test():
    '''Create the MDP, then run Q-learning on the peg game and print the learned Q-values.'''
    initial = Peg.Initial_state
    state = Peg.State(initial)
    learn = QLearning.Q(state)
    learn.register_R(Peg.R)
    learn.register_all_moves(Peg.all_possible_moves)
    learn.register_pegs(Peg.number_of_pegs)
    learn.QLearning(0.9, 100, 0.05)
    print(QValues_string(learn.Q))
def get_alg(name, args, env):
    if name == "oracleq":
        alg_dict = {'horizon': args.horizon,
                    'alpha': args.lr,
                    'conf': args.conf}
        alg = OracleQ.OracleQ(env.action_space.n, params=alg_dict)
    elif name == "neural-e3":
        alg_dict = {'horizon': args.horizon,
                    'dimension': args.dimension,
                    'n_hidden': args.n_hidden,
                    'n_ensemble': args.n_ensemble,
                    'n_playouts': args.n_playouts,
                    'n_samples': args.n_samples,
                    'n_model_updates': args.n_model_updates,
                    'ucb_c': args.ucb_c,
                    'anchor': args.anchor,
                    'lr': args.lr,
                    'batch_size': 100,
                    'n_actions': env.action_space.n,
                    'conf': args.conf}
        alg = NeuralE3.NeuralE3(env.action_space.n, params=alg_dict)
    elif name == 'decoding':
        alg_dict = {'horizon': env.horizon,
                    'model_type': args.model_type,
                    'n': args.n,
                    'num_cluster': args.num_cluster}
        alg = Decoding.Decoding(env.observation_space.n, env.action_space.n, params=alg_dict)
    elif name == 'uniform':
        alg_dict = {'horizon': env.horizon,
                    'model_type': args.model_type,
                    'n_actions': env.action_space.n,
                    'n': args.n,
                    'num_cluster': args.num_cluster}
        alg = Uniform.Uniform(env.action_space.n, params=alg_dict)
    elif name == 'qlearning':
        assert args.tabular, "[EXPERIMENT] Must run QLearning in tabular mode"
        alg_dict = {'alpha': float(args.lr),
                    'epsfrac': float(args.epsfrac),
                    'num_episodes': int(args.episodes)}
        alg = QLearning.QLearning(env.action_space.n, params=alg_dict)
    else:
        raise ValueError("[EXPERIMENT] Unknown algorithm: %s" % name)
    return alg
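# Hedged usage sketch for the factory above: the real experiment driver is
# not part of this collection, so the argparse fields and the env object
# below are assumptions based only on what the branches read.
import argparse

args = argparse.Namespace(horizon=10, lr=0.1, conf=0.1, tabular=True,
                          epsfrac=0.5, episodes=1000, model_type='linear',
                          n=5, num_cluster=3)
# alg = get_alg('qlearning', args, env)  # env must expose action_space.n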
def ql(args):
    if len(args) >= 2:
        model = parse_model(args[0])
        num_steps = int(args[1])
        if len(args) >= 3:
            record_file = open(args[2], 'w')
        else:
            record_file = None
        if len(args) >= 4:
            learning_rate = float(args[3])
        else:
            learning_rate = None
        if len(args) >= 5:
            discount_rate = float(args[4])
        else:
            discount_rate = None
        if len(args) >= 6:
            e = float(args[5])
        else:
            e = None

        if learning_rate is not None and discount_rate is not None and e is not None:
            ql = QLearning(model, learning_rate, discount_rate, e)
        elif learning_rate is not None and discount_rate is not None:
            ql = QLearning(model, learning_rate, discount_rate)
        elif learning_rate is not None:
            ql = QLearning(model, learning_rate)
        else:
            ql = QLearning(model)

        if record_file is not None:
            run(ql, num_steps, record_file)
        else:
            run(ql, num_steps)
    else:
        invalid()
def initializeQLearning(self, Q=None):
    myQLearning = QLearning.QLearning(self.MDP,
                                      self.Agent,
                                      self.alpha,
                                      self.gamma,
                                      self.epsilon,
                                      self.epsilonIncrement,
                                      1,
                                      self.H,
                                      Q=Q,
                                      gammaPRQL=self.gammaPRQL)
    return myQLearning
def test_1(self):
    qlearner = QLearning.QLearning([2, 2], 2, 0.9)
    print("action 1")
    qlearner.ObserveAction([0, 0], 1, [0, 1], 1, learningRateScale=1.0)
    print(qlearner.Q)
    print("visit")
    print(qlearner.visits)
    print("action 2")
    qlearner.ObserveAction([0, 0], 1, [0, 1], 1, learningRateScale=1.0)
    print(qlearner.Q)
    print("visit")
    print(qlearner.visits)
def set_player(args, color, s):
    if args == 1:
        _players = P.Player()
    elif args == 2:
        _players = R.RandomAI()
    elif args == 3:
        p_color = color
        # load previously learned data if it exists
        if os.path.exists('./data/{}_move_4x4.pickle'.format(s)):
            with open('./data/{}_move_4x4.pickle'.format(s), 'rb') as f:
                print('Loading... (this takes longer than you might expect)')
                _players = pickle.load(f)
                print('Done!!')
        else:
            _players = Q.QLearning(color)
    return _players
def __init__(self, n, ysize, xsize, erate, eps, gamma, alpha, maxEpisodes, maxSteps, touch, capture):
    """Initialize Pursuit.

    :param erate: probability that the prey fails to move
    """
    self.numOfAgents = n
    self.erate = erate
    self.dif = [[0, -1], [0, 1], [-1, 0], [1, 0], [0, 0]]
    self.ysize = ysize
    self.xsize = xsize
    self.touch = touch
    self.capture = capture
    dim = [self.ysize, self.xsize]
    for i in range(n):
        dim = dim + [self.ysize, self.xsize]
    self.ql = QLearning.QLearning(dim, pow(5, n), eps, gamma, alpha,
                                  maxEpisodes, maxSteps,
                                  self.inif, self.act, self.checkg,
                                  [touch, capture])
    self.ql.alabel = self.mkActLabel(n)  # an array instead of a dict
def __init__(self):
    super(CollectMineralsAndGas, self).__init__()
    data = ['SP Att.', 'SP Att.F', '#Depots', 'Ref. Att.', 'Ref. Att.F',
            '#Refineries', 'CC Att.', 'CC Att.f', 'Mins', 'Gas', 'IdleSCVs',
            'CCs', 'Supply', 'Score']
    with open('/home/kenn/Development/sc2-bot/CustomAgents/scores.txt', 'w+') as f:
        f.write(''.join('{:<15}'.format(col) for col in data) + '\n')
    self.qlearn = QLearning.QLearningTable(actions=list(range(len(smart_actions))))
    self.move_number = 0
    self.previous_action = None
    self.previous_state = None
    self.unit_type = None
    self.supply_depots = 0
    self.refineries = 0
    self.previous_collected_minerals_rate = 0
    self.previous_collected_vespene_rate = 0
    self.build_supply_depot_attempts = 0
    self.build_supply_depot_attempts_failed = 0
    self.build_cc_attempts = 0
    self.build_cc_attempt_failed = 0
    self.build_refinery_attempts = 0
    self.build_refinery_attempts_failed = 0
    self.builder_iterator = 0
    self.invocations = 0
    self.initializing = 0
    if os.path.isfile(DATA_FILE + '.gz'):
        self.qlearn.q_table = pd.read_pickle(DATA_FILE + '.gz', compression='gzip')
def main(maxGames, gamma, epsilon, bird_has_learned, q_values_counter):
    """The application's entry point.

    If someone executes this module (instead of importing it, for
    example), this function is called.
    """
    counter = 0
    QL = QLearning.Qvalue(gamma)
    if bird_has_learned == 1:
        QL.Q = q_values_counter
    reward = 10
    reward_die = -1000
    reward_pass = 1
    reward_ingap = 200
    scoreList = []
    avgScore = []
    filename_prefix = './q-attempt-auto-'
    filename = filename_prefix + str(gamma) + '-' + str(epsilon) + '.txt'
    f = open(filename, 'w+')
    pygame.init()
    while counter < maxGames:
        episode = []
        display_surface = pygame.display.set_mode((WIN_WIDTH, WIN_HEIGHT))
        pygame.display.set_caption('Pygame Flappy Bird')
        clock = pygame.time.Clock()
        score_font = pygame.font.SysFont(None, 32, bold=True)  # default font
        images = load_images()

        # the bird stays in the same x position, so bird.x is a constant
        # center bird on screen
        bird = Bird(50, int(WIN_HEIGHT / 2 - Bird.HEIGHT / 2), 2,
                    (images['bird-wingup'], images['bird-wingdown']))
        pipes = deque()
        nextPipes = deque()
        agent_y = None
        agent_status = True
        time_taken = []
        ActionList = []
        lastPipes = 0
        fcounter = 0
        frame_clock = 0  # this counter is only incremented if the game isn't paused
        score = 0
        done = paused = False
        while not done:
            clock.tick(FPS)

            # Handle this 'manually'.  If we used pygame.time.set_timer(),
            # pipe addition would be messed up when paused.
            if not (paused or frame_clock % msec_to_frames(PipePair.ADD_INTERVAL)):
                pp = PipePair(images['pipe-end'], images['pipe-body'])
                pipes.append(pp)
                nextPipes.append(pp)

            for e in pygame.event.get():
                if e.type == QUIT or (e.type == KEYUP and e.key == K_ESCAPE):
                    done = True
                    break
                elif e.type == KEYUP and e.key in (K_PAUSE, K_p):
                    paused = not paused
                elif e.type == MOUSEBUTTONUP or (e.type == KEYUP and e.key in (K_UP, K_RETURN, K_SPACE)):
                    bird.msec_to_climb = Bird.CLIMB_DURATION

            # ----------------------------- RL CODE: QLearning -----------------------------
            if fcounter % (FPS / 4) == 0:
                newState = QLearning.QLState(bird, pipes)
                if bird_has_learned == 1:
                    newAction = QLearning.epsilon_greedy(QL, 0.0, newState)
                else:
                    newAction = QLearning.epsilon_greedy(
                        QL, min(0.1, epsilon / float(counter + 1)), newState)
                if newAction == 'jump':
                    bird.msec_to_climb = Bird.CLIMB_DURATION
                episode.append((newState.short(), newAction))
            fcounter += 1

            if paused:
                continue  # don't draw anything

            # check for collisions
            pipe_collision = any(p.collides_with(bird) for p in pipes)
            if pipe_collision or 0 >= bird.y or bird.y >= WIN_HEIGHT - Bird.HEIGHT:
                done = True

            for x in (0, WIN_WIDTH / 2):
                display_surface.blit(images['background'], (x, 0))

            # -------------------- display predicted path --------------------
            # for state in predState:
            #     display_surface.blit(state.bird.image, state.bird.rect)
            # predState.pop(0)
            # -----------------------------------------------------------------

            while pipes and not pipes[0].visible:
                pipes.popleft()

            for p in pipes:
                p.update()
                display_surface.blit(p.image, p.rect)

            bird.update()
            display_surface.blit(bird.image, bird.rect)

            # update and display score
            for p in pipes:
                if p.x + PipePair.WIDTH < bird.x and not p.score_counted:
                    score += 1
                    p.score_counted = True
                    nextPipes.popleft()

            score_surface = score_font.render(str(score), True, (255, 255, 255))
            score_x = WIN_WIDTH / 2 - score_surface.get_width() / 2
            display_surface.blit(score_surface, (score_x, PipePair.PIECE_HEIGHT))
            pygame.display.flip()
            frame_clock += 1

        if bird_has_learned != 1:
            for i in range(len(episode) - 2):
                if 0 <= episode[i + 1][0][1] <= 3:
                    QL.update(episode[i][0], episode[i][1], reward_ingap, episode[i + 1][0], counter)
                else:
                    QL.update(episode[i][0], episode[i][1], reward, episode[i + 1][0], counter)
            QL.update(episode[-2][0], episode[-2][1], reward_die, episode[-1][0], counter)

        print('Game over! Score: %i\tnum states:%i\tnum games:%i' % (score, len(QL.Q), counter))
        # print(QL.Q)
        counter += 1
        if len(avgScore) == 0:
            avgScore.append(score)
        else:
            avgScore.append((avgScore[-1] * (counter - 1) + score) / float(counter))
        scoreList.append(score)

    pygame.quit()
    print(scoreList)
    print(avgScore)
    f.write(str(avgScore))
    f.write('\n')
    f.write(str(scoreList))
    f.write('\n')
    f.write(str(QL.Q))
    f.write('\n')
    f.close()
def get_learner(algorithm, model):
    if algorithm == "QLearning":
        if model.name == "SlipperyChain":
            return QLearning(model)
        elif model.name == "Loop":
            return QLearning(model)
        elif model.name == "LoopDeadEnd":
            return QLearning(model, 0.2, 0.99, 0.001, 0.999)
        elif model.name == "LoopDiffTrans":
            return QLearning(model, 0.3, 0.2, 0.2, 0.999)
        else:
            return QLearning(model)
    elif algorithm == "PrioritizedSweeping":
        if model.name == "SlipperyChain":
            return PrioritizedSweeping(model, 2, 0.2, 0.99, 0.9)
        elif model.name == "Loop":
            return PrioritizedSweeping(model, 2, 0.999, 0.99, 0.9)
        elif model.name == "LoopDeadEnd":
            return PrioritizedSweeping(model, 5, 0.999, 0.99, 0.9)
        elif model.name == "LoopDiffTrans":
            return PrioritizedSweeping(model, 5, 0.8, 0.99, 0.9)
        else:
            return PrioritizedSweeping(model)
    elif algorithm == "PrioritizedSweepingPolicy":
        return PrioritizedSweepingPolicy(model)
    elif algorithm == "PrioritizedSweepingHeuristics":
        if model.name == "SlipperyChain":
            return PrioritizedSweepingHeuristics(model, 2, 0.9, 0.99, 0.9)
        elif model.name == "Loop":
            return PrioritizedSweepingHeuristics(model, 2, 0.9, 0.99, 0.9)
        elif model.name == "LoopDeadEnd":
            return PrioritizedSweepingHeuristics(model, 1, 0.999, 0.999, 0.999)
        elif model.name == "LoopDiffTrans":
            return PrioritizedSweepingHeuristics(model, 5, 0.9, 0.99, 0.9)
        else:
            return PrioritizedSweepingHeuristics(model)
    elif algorithm == "QLearn":
        return QLearn(model)
    elif algorithm == "PrioritizedQLearning":
        return PrioritizedQLearning(model)
    elif algorithm == "BayesDP":
        if model.name == "SlipperyChain":
            return BayesPrioritizedSweeping(model, 10, 0.9, 1, 0.2, 20)
        elif model.name == "Loop":
            return BayesPrioritizedSweeping(model, 10, 0.9, 1, 0.2, 20)
        elif model.name == "LoopDeadEnd":
            return BayesPrioritizedSweeping(model)
        elif model.name == "LoopDiffTrans":
            return BayesPrioritizedSweeping(model, 1, 0.2, 1, 0.2, 20)
        else:
            return BayesPrioritizedSweeping(model)
    else:
        raise Exception(algorithm + " not found")
def main():
    """The application's entry point.

    If someone executes this module (instead of importing it, for
    example), this function is called.
    """
    counter = 0
    maxGames = 500
    QL = QLearning.Qvalue()
    # QL.Q = Counter({((7, 1), 'jump'): 7.857272463519327, ((7, 1), 'stay'): 7.741245285312653, ((9, 1), 'jump'): 6.974197946538579, ((9, 3), 'stay'): 2.4728502681599998, ((4, -2), 'jump'): 1.6991999999999998, ((7, 4), 'jump'): 1.656, ((4, -2), 'stay'): 1.6416, ((9, -2), 'jump'): 1.1663999999999999, ((4, 6), 'jump'): 0.96, ((2, -2), 'stay'): 0.9359999999999999, ((4, 6), 'stay'): 0.9359999999999999, ((2, 7), 'jump'): 0.6, ((7, -2), 'jump'): 0.6, ((5, 4), 'stay'): 0.6, ((9, -1), 'stay'): -80.54968253005099, ((9, 0), 'jump'): -97.33756762163051, ((5, 1), 'jump'): -99.28331164679123, ((-1, 2), 'jump'): -100.7318714988479, ((9, 0), 'stay'): -105.70532659645818, ((5, -1), 'stay'): -116.91400116117696, ((0, 1), 'stay'): -120.35252404640816, ((7, 0), 'stay'): -130.22648878053943, ((5, 1), 'stay'): -133.71809218288863, ((5, 0), 'jump'): -148.9099124363754, ((-3, 1), 'stay'): -154.4543708087079, ((7, 0), 'jump'): -156.57084541134142, ((5, 0), 'stay'): -162.0850310531728, ((4, 1), 'jump'): -171.2932661473332, ((4, 0), 'jump'): -179.7965693246642, ((-1, 2), 'stay'): -182.8699917288755, ((2, 1), 'jump'): -186.62179477555162, ((0, 0), 'jump'): -192.10570383523043, ((4, 0), 'stay'): -193.63696553046788, ((7, -2), 'stay'): -199.4327808, ((-1, 0), 'stay'): -202.9226452728635, ((-3, 0), 'jump'): -209.71099375511983, ((9, 2), 'jump'): -211.6669386541629, ((-1, 1), 'stay'): -216.87973668175724, ((0, 0), 'stay'): -217.877507717578, ((2, 0), 'stay'): -222.32663278238968, ((7, 2), 'stay'): -246.34051949020602, ((2, 1), 'stay'): -246.74031192502184, ((1, 0), 'jump'): -257.4947922630647, ((-1, 1), 'jump'): -257.66471443706297, ((1, 1), 'stay'): -262.18436199797634, ((1, 1), 'jump'): -268.6598699141706, ((-3, 1), 'jump'): -269.1352608305162, ((5, -2), 'stay'): -274.63086373166846, ((0, 1), 'jump'): -282.6151560499842, ((2, 0), 'jump'): -284.34034051851995, ((4, 5), 'jump'): -300.6624, ((9, 2), 'stay'): -345.8247595834438, ((9, -1), 'jump'): -348.47208661575024, ((5, 4), 'jump'): -356.20995840000006, ((2, 6), 'stay'): -359.15999999999997, ((2, 6), 'jump'): -359.4, ((9, 1), 'stay'): -359.8062183772605, ((4, -1), 'stay'): -400.5373431989693, ((7, -1), 'stay'): -415.27671308280446, ((9, 3), 'jump'): -466.85968987967163, ((9, -2), 'stay'): -481.6451347036986, ((-1, 0), 'jump'): -505.0351877771873, ((2, -2), 'jump'): -560.6256, ((4, 5), 'stay'): -576.9926399999999, ((1, 6), 'jump'): -600.0, ((1, 7), 'stay'): -600.0, ((1, -2), 'jump'): -600.0, ((7, -1), 'jump'): -603.1319891393689, ((-3, 0), 'stay'): -638.8181874316656, ((-3, 2), 'stay'): -665.6193584360807, ((4, 1), 'stay'): -706.3801965210189, ((-3, 2), 'jump'): -711.8758049693784, ((5, -1), 'jump'): -736.2986304823946, ((4, 4), 'stay'): -800.0267886182401, ((2, -1), 'stay'): -804.4422906834702, ((1, 0), 'stay'): -838.2476058673772, ((1, -3), 'stay'): -840.0, ((4, -1), 'jump'): -852.0242460274835, ((7, 4), 'stay'): -860.9277966754564, ((7, 3), 'stay'): -865.0574039234211, ((4, 4), 'jump'): -865.9002218495999, ((7, 3), 'jump'): -878.6417536488161, ((2, 5), 'jump'): -888.9302400000001, ((2, 5), 'stay'): -888.9456, ((4, 3), 'stay'): -934.4293978218991, ((1, 6), 'stay'): -936.0, ((2, 4), 'stay'): -956.7407422463999, ((5, 3), 'jump'): -972.1059646694266, ((1, -2), 'stay'): -974.4000000000001, ((1, 5), 'jump'): -974.4000000000001, ((7, 2), 'jump'): -984.7275651949986, ((5, 2), 'jump'): -988.945812073332, ((5, 3), 'stay'): -992.5832848500062, ((5, 2), 'stay'): -995.5538804111354, ((1, 5), 'stay'): -995.904, ((2, -1), 'jump'): -997.512428804205, ((4, 2), 'jump'): -997.6603770175502, ((4, 2), 'stay'): -997.9212840472369, ((4, 3), 'jump'): -997.9374335860218, ((2, 4), 'jump'): -998.88964786176, ((2, 2), 'stay'): -998.9614614721633, ((2, 3), 'jump'): -998.9974237927165, ((2, 2), 'jump'): -998.9982333417834, ((2, 3), 'stay'): -998.9994806521587, ((1, -1), 'jump'): -999.95805696, ((1, 4), 'stay'): -999.9932891136, ((1, 4), 'jump'): -999.998926258176, ((1, 2), 'stay'): -999.9997408910938, ((1, 3), 'jump'): -999.9999725122093, ((1, 2), 'jump'): -999.9999992963126, ((1, 3), 'stay'): -999.999999718525, ((1, -1), 'stay'): -1049.2908903268349})
    reward = 10
    reward_die = -1000
    reward_pass = 10
    scoreList = []
    avgScore = []
    pygame.init()
    while counter < maxGames:
        episode = []
        display_surface = pygame.display.set_mode((WIN_WIDTH, WIN_HEIGHT))
        pygame.display.set_caption('Pygame Flappy Bird')
        clock = pygame.time.Clock()
        score_font = pygame.font.SysFont(None, 32, bold=True)  # default font
        images = load_images()

        # the bird stays in the same x position, so bird.x is a constant
        # center bird on screen
        bird = Bird(50, int(WIN_HEIGHT / 2 - Bird.HEIGHT / 2), 2,
                    (images['bird-wingup'], images['bird-wingdown']))
        pipes = deque()
        nextPipes = deque()
        agent_y = None
        agent_status = True
        time_taken = []
        ActionList = []
        lastPipes = 0
        fcounter = 0
        frame_clock = 0  # this counter is only incremented if the game isn't paused
        score = 0
        done = paused = False
        while not done:
            clock.tick(FPS)

            # Handle this 'manually'.  If we used pygame.time.set_timer(),
            # pipe addition would be messed up when paused.
            if not (paused or frame_clock % msec_to_frames(PipePair.ADD_INTERVAL)):
                pp = PipePair(images['pipe-end'], images['pipe-body'])
                pipes.append(pp)
                nextPipes.append(pp)

            for e in pygame.event.get():
                if e.type == QUIT or (e.type == KEYUP and e.key == K_ESCAPE):
                    done = True
                    break
                elif e.type == KEYUP and e.key in (K_PAUSE, K_p):
                    paused = not paused
                elif e.type == MOUSEBUTTONUP or (e.type == KEYUP and e.key in (K_UP, K_RETURN, K_SPACE)):
                    bird.msec_to_climb = Bird.CLIMB_DURATION

            # ----------------------------- RL CODE: QLearning -----------------------------
            if fcounter % (FPS / 8) == 0:
                newState = QLearning.QLState(bird, pipes)
                if counter % 10 == 0:
                    newAction = QLearning.epsilon_greedy(QL, 0, newState)
                else:
                    newAction = QLearning.epsilon_greedy(
                        QL, min(0.6, 10 / math.sqrt(counter + 1)), newState)
                if newAction == 'jump':
                    bird.msec_to_climb = Bird.CLIMB_DURATION
                episode.append((newState.short(), newAction))
            fcounter += 1

            if paused:
                continue  # don't draw anything

            # check for collisions
            pipe_collision = any(p.collides_with(bird) for p in pipes)
            if pipe_collision or 0 >= bird.y or bird.y >= WIN_HEIGHT - Bird.HEIGHT:
                done = True

            for x in (0, WIN_WIDTH / 2):
                display_surface.blit(images['background'], (x, 0))

            # -------------------- display predicted path --------------------
            # for state in predState:
            #     display_surface.blit(state.bird.image, state.bird.rect)
            # predState.pop(0)
            # -----------------------------------------------------------------

            while pipes and not pipes[0].visible:
                pipes.popleft()

            for p in pipes:
                p.update()
                display_surface.blit(p.image, p.rect)

            bird.update()
            display_surface.blit(bird.image, bird.rect)

            # update and display score
            for p in pipes:
                if p.x + PipePair.WIDTH < bird.x and not p.score_counted:
                    score += 1
                    p.score_counted = True
                    nextPipes.popleft()

            score_surface = score_font.render(str(score), True, (255, 255, 255))
            score_x = WIN_WIDTH / 2 - score_surface.get_width() / 2
            display_surface.blit(score_surface, (score_x, PipePair.PIECE_HEIGHT))
            pygame.display.flip()
            frame_clock += 1

        for i in range(len(episode) - 2):
            if 0 <= episode[i + 1][0][1] <= 3:
                QL.update(episode[i][0], episode[i][1], reward_pass, episode[i + 1][0], counter)
            else:
                QL.update(episode[i][0], episode[i][1], reward, episode[i + 1][0], counter)
        QL.update(episode[-2][0], episode[-2][1], reward_die, episode[-1][0], counter)

        print('Game over! Score: %i' % score)
        # print(QL.Q)
        counter += 1
        print(counter)
        if (counter - 1) == 0:
            avgScore.append(score)
        elif (counter - 1) % 10 == 0:
            avgScore.append(avgScore[-1] * (counter - 1) / counter + score / counter)
        if (counter - 1) % 10 == 0:
            scoreList.append(score)
def training_one(runs_index):
    qlearner = QLearning.QLearning(
        stateSpaceShape=Assignment7Support.CartPoleStateSpaceShape(),
        numActions=env.action_space.n,
        discountRate=discountRate)
    print(f'[{datetime.datetime.now()}] Start training, runs id {runs_index + 1}')

    for trialNumber in range(trainingIterations):
        observation = env.reset()
        reward = 0
        for i in range(300):
            # env.render()
            currentState = Assignment7Support.CartPoleObservationToStateSpace(observation)
            action = qlearner.GetAction(currentState,
                                        learningMode=True,
                                        randomActionRate=randomActionRate,
                                        actionProbabilityBase=actionProbabilityBase)
            oldState = Assignment7Support.CartPoleObservationToStateSpace(observation)
            observation, reward, isDone, info = env.step(action)
            newState = Assignment7Support.CartPoleObservationToStateSpace(observation)
            qlearner.ObserveAction(oldState, action, newState, reward,
                                   learningRateScale=learningRateScale)
            if isDone:
                # if (trialNumber + 1) % 1000 == 0:
                #     print(trialNumber + 1, i + 1, np.max(qlearner.q_table), np.mean(qlearner.q_table))
                break
    print(f'[{datetime.datetime.now()}] End of the training, runs id {runs_index + 1}')

    ## Now do the best n runs I can
    # input('Enter to continue...')
    n = 20
    totalRewards = []
    for runNumber in range(n):
        observation = env.reset()
        totalReward = 0
        reward = 0
        for i in range(300):
            # renderDone = env.render()
            currentState = Assignment7Support.CartPoleObservationToStateSpace(observation)
            observation, reward, isDone, info = env.step(
                qlearner.GetAction(currentState, learningMode=False))
            totalReward += reward
            if isDone:
                # renderDone = env.render()
                # print(runNumber + 1, i + 1, totalReward)
                totalRewards.append(totalReward)
                break
    # env.close()
    average_score = sum(totalRewards) / float(len(totalRewards))
    print(f'[{datetime.datetime.now()}] End of the Test, runs id {runs_index + 1}')
    print(f'runs id {runs_index + 1}, {totalRewards}')
    print(f'Your Score: {average_score}, runs id {runs_index + 1}')
    return average_score
if opt == '-h':
    usage()
    sys.exit()
elif opt == "-l":
    print("-l seen")
    levelfile = arg
elif opt == "-k":
    k = int(arg)
elif opt == "-a":
    a = arg
elif opt == "-y":
    y = arg
elif opt == "-m":
    m = arg
elif opt == "-t":
    t = arg
elif opt == "-x":
    x = arg

try:
    flatland = Flatland.Flatland(0, levelfile)
except Exception:
    print("problem loading level file")
    usage()
    sys.exit(2)

q = QLearning.QLearning(k, levelfile, a, y, m, t, x)
q.learn()
q.run(False, [1.0], True)
while True:
    q.run(True, [1.0], True)
from QLearning import *
from ChainModel import *

ps = QLearning(SlipperyChainModel(0.1), 0.5, 0.8, 0)
for i in range(10000):
    print(ps.next())

# expect state 5 to have the highest potential
# for state in ps.model.states:
#     print(ps.get_v(state))

# for i in range(1, 6):
#     print("transition model")
#     print(ps.get_transition_table(ps.model.state[i], ps.model.act_a))
#     print(ps.get_transition_table(ps.model.state[i], ps.model.act_b))
#     print("reward model")
#     print(ps.get_reward_table(ps.model.state[i], ps.model.act_a))
#     print(ps.get_reward_table(ps.model.state[i], ps.model.act_b))
createTransitionProbabilityDict = Transition.CreateTransitionProbabilityDict(transitionFunction)
transitionFromStateAndAction = Transition.TransitionFromStateAndAction(worldRange)
transitionProbabilityDict = createTransitionProbabilityDict(stateList, actionList)
createRewardDict = Reward.MultiTargetsRewardDict(stateList, actionList, targetReward)
runValueIteration = ValueIteration.ValueIteration(stateList, actionList, decayRate,
                                                  convergeThreshold, maxIterationStep)
createPolicyFromValue = ValueIteration.PolicyFromValue(stateList, actionList, decayRate)
runQLearning = QLearning.QLearning(alpha, gamma, epsilon, segmentTotalNumber,
                                   stateList, actionList, transitionFromStateAndAction)
print('finish setting function', time.time() - time0)

trainWolfPolicy = TrainWolfPolicyValueIteration(stateList, transitionProbabilityDict,
                                                createRewardDict, runValueIteration,
                                                createPolicyFromValue)
# trainWolfPolicy = TrainWolfPolicyQLearning(stateList, createRewardDict, runQLearning)
wolfPolicy = trainWolfPolicy()
# print(wolfPolicy)
print('finish training policy', time.time() - time0)
print('begin saving policy, please wait')
Writer.savePolicyToPkl(wolfPolicy, savePolicyFilename)
from time import sleep

import Tictactoe
import Tablero
import Jugador
import QLearning

ai = QLearning.QLearning()
ai2 = QLearning.QLearning()
jugador1 = Jugador.Jugador("player1", 'X', ai)
jugador2 = Jugador.Jugador("player2", 'O', None)
jugadores = []
jugadores.append(jugador1)
jugadores.append(jugador2)
tablero = Tablero.Tablero()
game = Tictactoe.tictactoe(tablero, jugadores)
game.encender()

# while True:
#     game.new_game()
#     if game.iterations == 3:
#         game.new_game(t="q")
#         break
states_377 = np.concatenate((states_377_train, states_377_test), axis=0)
next_states_377 = np.concatenate((next_states_377_train, next_states_377_test), axis=0)
actions_377 = np.concatenate((actions_377_train, actions_377_test), axis=0)
rewards_377 = np.concatenate((rewards_377_train, rewards_377_test), axis=0)
trajectories_377 = trajectories_377_train + trajectories_377_test

print('-------------------Evaluation for Q377--------------------------------')
# evaluation for q377
state_rewards_377 = ql.estimate_rewards(next_states_377_train, actions_377_train,
                                        rewards_377_train, action_q377)
discounted_rewards_377 = ql.discount_rewards(state_rewards_377, discount)
discounted_max_states_377 = ql.get_max_reward_states(discounted_rewards_377)
max_states_377 = ql.get_max_reward_states(state_rewards_377)
q377_policy = QLearnedPolicy(discounted_max_states_377[0], q377_labels)
print('Reward of max state = {a}, discounted max state = {b}'.format(
    a=state_rewards_377[max_states_377[0]],
    b=state_rewards_377[discounted_max_states_377[0]]))
print('Discounted reward of max state = {a}, discounted max state = {b}'.format(
    a=discounted_rewards_377[max_states_377[0]],
    b=discounted_rewards_377[discounted_max_states_377[0]]))
print('Max state actions: {a} \nDiscounted max state actions: {b}'.format(
    a=q377_labels[np.array(max_states_377[0]).astype(int) == 1],
    b=q377_labels[np.array(discounted_max_states_377[0]).astype(int) == 1]))

action_counts_377 = defaultdict(lambda: defaultdict(int))
for s, a in zip(states_377, actions_377):
    action_counts_377[tuple(s)][a] += 1
        return unmapper([cord[0] - 1, cord[1]], dim)
    elif action == 2:
        return unmapper([cord[0], cord[1] - 1], dim)
    else:
        return unmapper([cord[0] + 1, cord[1]], dim)


R0 = np.zeros([36, 4])
R0[32, 0] = 100
learning = QLearning(range(0, 36), range(0, 4), R0, 0.5, 1)
state = 23
for i in range(0, 10000):
    action = learning.choose_action(state, 1 - float(i) / 100)
    next_state = apply_model(state, action)
    learning.update_model(state, action, next_state)
    if mod(i, 100) == 0:
        state = 23
    else:
        state = next_state
plt.clf()
gw.DrawMap(np.reshape(np.max(learning.Q, 1), [6, 6]), model)
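# The exploration schedule above, 1 - i/100, goes negative after the first
# 100 steps, so choose_action presumably clamps it. A common alternative
# that stays in (0, 1] is an exponential decay; a sketch with illustrative
# names (epsilon-greedy over the rows of a numpy Q table):
import random
import numpy as np

def epsilon_greedy(Q, state, i, eps_start=1.0, eps_min=0.05, decay=0.999):
    eps = max(eps_min, eps_start * decay ** i)
    if random.random() < eps:
        return random.randrange(Q.shape[1])  # explore: uniform random action
    return int(np.argmax(Q[state]))          # exploit: greedy action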
        return unmapper([cord[0] - 1, cord[1]], dim)
    elif action == 2:
        return unmapper([cord[0], cord[1] - 1], dim)
    else:
        return unmapper([cord[0] + 1, cord[1]], dim)


R0 = np.zeros([dim[0] * dim[1], 4])
goal = find_free(model)
R0[goal, 0] = 100
learning = QLearning(range(0, dim[0] * dim[1]), range(0, 4), R0, 0.5, 1)
s0 = goal
while s0 == goal:
    s0 = find_free(model)
plt.close()
plt.figure(figsize=(dim[1] / 2, dim[0]), facecolor='w')
for _ in range(100):
    preview = np.zeros(dim)
    state = s0
    ma.update_state(card3, third)
    ##################################################################
    wp = ma.winning_card(ma.stack)
    points_dict[played[wp]] += ma.eval_stack(ma.stack)
    ma.clear_state()
    return points_dict


if __name__ == "__main__":
    # player can be anything from 1_1, 2_2, 3_3, random, LW, LB, LWW, LBB
    play_modes = ["1_1", "2_2", "3_3", "random", "LW", "LB", "LWW", "LBB"]
    qlearning = QLearning(0.1, 0.1, 0.1)
    for mode1 in play_modes:
        for mode2 in play_modes:
            for mode3 in play_modes:
                print(f"Working on: {mode1} {mode2} {mode3}")
                count = 0
                file = open(mode1 + " " + mode2 + " " + mode3, "w")
                s = ""
                while count < 1000:
                    print(count / 1000)
                    # init data to store into file
                    data = dict()
def main():
    Default = 0
    QLearning = 1
    Genetic = 2
    C.initialize()

    # Initialize pygame and window surface.
    pygame.init()
    win = pygame.display.set_mode((C.WINDOW_WIDTH, C.WINDOW_HEIGHT))
    pygame.display.set_caption("Asteroids Genetic Algorithm")
    timer = pygame.time.Clock()

    # Initialize Q-Learning.
    if MODE == QLearning:
        QTRAINING = True
        Q.Q_Matrix = Q.initialize()
        actiontimer = 0
        action = 0
        currentaction = 0
        oldstateval = 0
        oldscore = 0
        prevQscore = 0

    # Initialize level one asteroids ().
    LEVEL = 1
    asteroids = []
    asteroids = generateAsteroids(asteroids, LEVEL)

    # Initialize scoreboard.
    SCORE = 0
    if C.DISPLAY_GAME:
        font = pygame.font.Font('Vector_Battle.ttf', 24)
        font.set_bold(True)
        show_score = font.render('SCORE: 0', True, C.WHITE, C.BLACK)
        scoreboard = show_score.get_rect()
        scoreboard.center = (150, 50)

    # Initialize player sprite.
    player = Player(C.WINDOW_WIDTH / 2, C.WINDOW_HEIGHT / 2, 0)
    if C.DISPLAY_GAME:
        ship = pygame.image.load(player.IMAGE)
        ship = pygame.transform.rotate(ship, -90)
        ship = pygame.transform.scale(ship, (C.PLAYERSIZE, C.PLAYERSIZE))
        # Initialize state display.
        show_state = font.render('State: ' + ' '.join(player.state), True, C.WHITE, C.BLACK)
        statedisplay = show_state.get_rect()
        statedisplay.center = (535, 150)

    # Initialize projectiles.
    projectiles = []

    # Initialize timers.
    respawntime = 0
    run = True

    if MODE == Genetic:
        population = [GA.random_chromosome() for _ in range(GA.PopulationSize)]
        fitness_scores = [0 for i in range(GA.PopulationSize)]
        for each in range(len(population)):
            fitness_scores[each] = simulate(newGameContainer(), population[each])
            print(fitness_scores[each])
        average = GA.average_fitness(fitness_scores)
        print("avg fitness: " + str(average))
        i = 0
        while i < GA.NumIterations:
            i += 1
            population, fitness_scores = GA.breed(population, fitness_scores)
            average = GA.average_fitness(fitness_scores)
            print("avg-fitness: " + str(average))
        best_chromosome = population[GA.best_solution(fitness_scores)]

    while run:
        for event in pygame.event.get():
            if event.type == pygame.QUIT:
                run = False

        # If using Q-Learning, train the Q-Matrix when the action timer runs out.
        if MODE == QLearning:
            actiontimer += 1
            if actiontimer == C.FRAMES_PER_ACTION:
                actiontimer = 0
                reward = SCORE - oldscore
                oldscore = SCORE
                nextbest = Q.Q_Matrix[C.state.index(player.state)][Q.greedy_choice(player.state)]
                Q.Q_Matrix[oldstateval][action] = prevQscore + C.stepsize * (
                    reward + C.discount * nextbest - prevQscore)
                oldstateval = C.state.index(player.state)
                action = Q.choose_action(player.state)
                prevQscore = Q.Q_Matrix[oldstateval][action]
                currentaction = C.actions[action]
        if MODE == Genetic:
            action = GA.updateAction(player, best_chromosome)
            executeAction(player, projectiles, action)

        # Choose an action, based on current key press or Q-Learning decision.
        keys = pygame.key.get_pressed()
        if (MODE == QLearning and currentaction == 'Left') or keys[pygame.K_LEFT]:
            player.rotation += 5
        if (MODE == QLearning and currentaction == 'Right') or keys[pygame.K_RIGHT]:
            player.rotation -= 5
        if (MODE == QLearning and currentaction == 'Thrust') or keys[pygame.K_UP]:
            if player.speed <= C.MAXSPEED:
                player.speed += C.THRUST
                del player.thrustvectors[0]
                player.thrustvectors.append([player.speed, player.rotation])
        if (MODE == QLearning and currentaction == 'Shoot') or keys[pygame.K_SPACE]:
            if not player.firing:
                projectiles.append(fireProjectile(player))
                player.firing = True
        if MODE == QLearning:
            if currentaction != 'Shoot':
                player.firing = False
        else:
            if not keys[pygame.K_SPACE]:
                player.firing = False

        # Update player, asteroids, projectiles, SCORE, LEVEL and state.
        rays = sense(player, asteroids)
        projectiles = detectProjectileColision(asteroids, projectiles)
        SCORE += updateScore(player, asteroids)
        player.score = SCORE
        updatePlayer(player)
        LEVEL = updateAsteroids(asteroids, LEVEL)
        updateProjectiles(projectiles)

        # Draw the game.
        if C.DISPLAY_GAME:
            drawGame(player, ship, asteroids, projectiles, scoreboard, SCORE,
                     statedisplay, rays, font, win)
        timer.tick(C.FPS)

    if MODE != Genetic:
        pygame.quit()
    if C.SAVEQMATRIX:
        saveQmatrix(Q.Q_Matrix)
states_315 = np.concatenate((states_315_train, states_315_test), axis=0)
next_states_315 = np.concatenate((next_states_315_train, next_states_315_test), axis=0)
actions_315 = np.concatenate((actions_315_train, actions_315_test), axis=0)
rewards_315 = np.concatenate((rewards_315_train, rewards_315_test), axis=0)
trajectories_315 = trajectories_315_train + trajectories_315_test

# evaluation for q315
print('-------------------Evaluation for Q315--------------------------------')
state_rewards_315 = ql.estimate_rewards(next_states_315_train, actions_315_train,
                                        rewards_315_train, action_q315)
discounted_rewards_315 = ql.discount_rewards(state_rewards_315, discount)
discounted_max_states_315 = ql.get_max_reward_states(discounted_rewards_315)
max_states_315 = ql.get_max_reward_states(state_rewards_315)
q315_policy = QLearnedPolicy(discounted_max_states_315[0], q315_labels)
print('Reward of max state = {a}, discounted max state = {b}'.format(
    a=state_rewards_315[max_states_315[0]],
    b=state_rewards_315[discounted_max_states_315[0]]))
print('Discounted reward of max state = {a}, discounted max state = {b}'.format(
    a=discounted_rewards_315[max_states_315[0]],
    b=discounted_rewards_315[discounted_max_states_315[0]]))
print('Max state actions: {a} \nDiscounted max state actions: {b}'.format(
    a=q315_labels[np.array(max_states_315[0]).astype(int) == 1],
    b=q315_labels[np.array(discounted_max_states_315[0]).astype(int) == 1]))

action_counts_315 = defaultdict(lambda: defaultdict(int))
for s, a in zip(states_315, actions_315):
    action_counts_315[tuple(s)][a] += 1
if __name__ == "__main__":
    args = sys.argv  # reading filename from command-line arguments
    argc = len(args)
    if argc != 5:
        print('usage : $ python', args[0],
              'maze_map(.txt file) episode(number) reward(0-1 float) punishment(0-1 float)')
        exit()

    n_episode = int(args[2])  # the number of episodes
    reward = float(args[3])  # reward
    punishment = float(args[4])  # punishment

    # fix seed for reproducibility
    random.seed(0)
    np.random.seed(0)

    maze_map = Map.Map(args[1])
    maze_map.printmap()  # print maze (initial state)
    agent = Agent.Agent(maze_map)
    qtable = QTable.QTable(maze_map)
    qlearning = QLearning.QLearning(n_episode, reward, punishment)

    # execute QLearning
    qlearning.execute_qlearning(maze_map, agent, qtable)

    # printing result
    qtable.print_result(maze_map)
    # end of program
REFRESH_TIME = 40
SIZE = 480

tk_obj = Tk()
canvas = Canvas(tk_obj, background="white", width=SIZE, height=SIZE)
top_wall = canvas.create_rectangle((0, 0, SIZE, 10), fill="black")
bot_wall = canvas.create_rectangle((0, SIZE - 10, SIZE, SIZE), fill="black")
computer = canvas.create_rectangle((SIZE - 10, 240, SIZE, 144), fill="black")
player = canvas.create_rectangle((0, 240, 10, 144), fill="black")
ball = None
x_velocity = 14.4
y_velocity = 4.8
test_games = 0
learned = False

q_table = QLearning.get_table()
for i in range(12):
    for j in range(12):
        print(q_table[i][j])

canvas.pack()
reset_ball()
tk_obj.bind("<KeyPress-Up>", move_up)
tk_obj.bind("<KeyPress-Down>", move_down)
tk_obj.after(REFRESH_TIME, refresh)
tk_obj.mainloop()
import gym

env = gym.make('CartPole-v0')

import random
import QLearning  # Your implementation goes here...
import Assignment7Support

discountRate = 0.98          # Controls the discount rate for future rewards -- this is gamma from 13.10
actionProbabilityBase = 1.8  # This is k from the P(a_i|s) expression from section 13.3.5 and influences how random exploration is
randomActionRate = 0.01      # Percent of time the next action selected by GetAction is totally random
learningRateScale = 0.01     # Should be multiplied by visits_n from 13.11.
trainingIterations = 20000

qlearner = QLearning.QLearning(
    stateSpaceShape=Assignment7Support.CartPoleStateSpaceShape(),
    numActions=env.action_space.n,
    discountRate=discountRate)

for trialNumber in range(trainingIterations):
    observation = env.reset()
    reward = 0
    for i in range(300):
        env.render()
        print("Iteration ", i)
        currentState = Assignment7Support.CartPoleObservationToStateSpace(observation)
        action = qlearner.GetAction(currentState,
                                    learningMode=True,
                                    randomActionRate=randomActionRate,
                                    actionProbabilityBase=actionProbabilityBase)