import random
from time import sleep

from engine import TetrisEngine  # assumed module path, consistent with the other scripts


class BasicAgent():
    def __init__(self):
        # Initializes a Tetris playing field of width 10 and height 20.
        self.env = TetrisEngine()

    def run(self):
        # Loop to keep playing games
        while True:
            # Variable to indicate whether the game has ended or not
            done = False
            # Resets the environment
            state = self.env.reset()
            # Loop that keeps making moves as long as the game hasn't ended yet
            while not done:
                # Picks a random action
                action = random.randint(0, 5)
                # Overrides the random choice with a fixed action (likely leftover debug code)
                action = 5
                # Performs the action in the game engine
                next_state, reward, done, info = self.env.step(action)
                # Render the game state
                self.env.render()
                # Sleep to make sure a human can follow the gameplay
                sleep(0.05)
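# Illustrative sketch (not part of the original snippet): a minimal entry
# point for running the random baseline agent above until interrupted.
if __name__ == '__main__':
    BasicAgent().run()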
def setup(self):
    # Initialization
    for i in range(self.player_num):
        self.engines[i] = TetrisEngine(self.width, self.height)
        self.engines[i].clear()

    if self.use_gui:
        gui = GUI(self, self.block_size)
        self.gui = gui
    else:
        self.stdscr = curses.initscr()
        curses.noecho()

    # Store play information
    self.dbs = {}

    self.done = False
    for i in range(self.player_num):
        # Initial rendering
        self.engine_states[i] = {
            "KO": 0,
            "reward": 0,
            "lines_sent": 0,
            "lines_cleared": 0,
            "hold_shape": None,
            "hold_shape_name": None,
            "hold_locked": False,
            "garbage_lines": 0,
            "highest_line": 0,
            "combo": -1
        }
        # Initialize dbs
        self.dbs[i] = []

    self.game_count += 1
    self.start_time = time.time()
    curses.noecho()
    # React to keys without pressing enter (700ms delay)
    curses.halfdelay(7)
    # Enumerate keys
    stdscr.keypad(True)

    # return stdscr


if __name__ == '__main__':
    # Curses standard screen
    stdscr = curses.initscr()

    # Init environment
    width, height = 10, 20  # standard tetris friends rules
    env = TetrisEngine(width, height)

    # Play games on repeat
    while True:
        init()
        stdscr.clear()
        env.clear()
        db = play_game()

        # Return to terminal
        terminate()

        # Should the game info be saved?
        if save_game():
            try:
                fr = open('training_data.npy', 'rb')
                x = np.load(fr)
def main(episode, load, learn, debug, random_rate, session):
    load_model = load
    print("load model", load_model, "learn", learn, "debug", debug, "episode", episode)

    width, height = 7, 14  # standard tetris friends rules
    env = TetrisEngine(width, height)
    action_count = 7
    agent = Agent(lr=1e-4,
                  input_dims=width * height,
                  gamma=0.5,
                  n_actions=action_count,
                  l1_size=512,
                  l2_size=128)

    if session:
        model_filename = "%s-trained_model.torch" % session
    else:
        model_filename = "trained_model.torch"

    # Note: len(p) counts only the first dimension of each tensor;
    # p.numel() would give the true number of parameters.
    parameter_size = sum([len(p) for p in agent.policy.parameters()])
    print("network parameter size:", parameter_size)

    action_idx = 0
    if load_model:
        agent.policy.load_state_dict(T.load(model_filename))

    for i in range(episode):
        done = False
        score = 0
        state = env.clear()
        counter = 0
        while not done:
            counter += 1
            action, probs = agent.choose_action(state)
            prob = probs[action].item()
            state, reward, done = env.step(action)
            agent.store_rewards(reward)
            score += reward

            if debug:
                stdscr = curses.initscr()
                stdscr.clear()
                stdscr.addstr(str(env))
                stdscr.addstr('\ncumulative reward: ' + str(score))
                stdscr.addstr('\nreward: ' + str(reward))
                time.sleep(.2)
                continue

            if not debug and i % 100 == 0 and counter % 100 == 1:
                idx2direction = {
                    0: "left",
                    1: "right",
                    2: "hard_drop",
                    3: "soft_drop",
                    4: "rotate_left",
                    5: "rotate_right",
                    6: "idle"
                }
                probs_str = ""
                for z, item in enumerate(probs):
                    probs_str += "%s:%0.2f, " % (idx2direction[z], item.item())
                print(probs_str)
                print('episode: ', i, 'counter: ', counter,
                      'reward %0.3f' % reward,
                      'action: %s (%0.2f)' % (action, prob))
                writer.add_scalar("action prob", prob, action_idx)
                action_idx += 1

        if not debug and i % 100 == 0:
            print('episode: ', i, 'score %0.3f' % score)
            writer.add_scalar("final score", score, i)

        if learn:
            agent.learn()

        if i % 1000 == 0:
            T.save(agent.policy.state_dict(), model_filename)

    writer.close()
import sys
import os
import shutil
from collections import namedtuple
from itertools import count

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable

from engine import TetrisEngine

width, height = 10, 20  # standard tetris friends rules
engine = TetrisEngine(width, height)

# if gpu is to be used
use_cuda = torch.cuda.is_available()
if use_cuda:
    print("....Using Gpu...")
FloatTensor = torch.cuda.FloatTensor if use_cuda else torch.FloatTensor
LongTensor = torch.cuda.LongTensor if use_cuda else torch.LongTensor
ByteTensor = torch.cuda.ByteTensor if use_cuda else torch.ByteTensor
# Tensor = FloatTensor

######################################################################
# Replay Memory
# -------------
# - ``Transition`` - a named tuple representing a single transition in
#   our environment
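# Illustrative sketch of the replay memory the comment above introduces, in
# the usual PyTorch-DQN-tutorial style. The field names and the cyclic buffer
# are assumptions; the original file's own definitions are not shown in this
# excerpt.
import random

Transition = namedtuple('Transition',
                        ('state', 'action', 'next_state', 'reward'))


class ReplayMemory(object):
    """Fixed-size cyclic buffer of transitions, sampled uniformly at random."""

    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []
        self.position = 0

    def push(self, *args):
        # Overwrite the oldest entry once the buffer is full.
        if len(self.memory) < self.capacity:
            self.memory.append(None)
        self.memory[self.position] = Transition(*args)
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)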
    ]

    for pre_actions in possible_pre_actions:
        shape, anchor = engine.shape, engine.anchor

        # Applies the pre-actions.
        for a in pre_actions:
            shape, anchor = engine.actions[a](shape, anchor, board)
        shape, anchor = engine.actions.soft_drop(shape, anchor, board)

        # Tests the best sequence of post-actions.
        for action in [engine.actions.LEFT, engine.actions.RIGHT]:
            new_actions, new_score = compute_helper(engine, shape, anchor, action)
            if new_score < min_score:
                actions = pre_actions + new_actions
                min_score = new_score

    return actions


if __name__ == '__main__':
    engine = TetrisEngine(width=10, height=20)
    steps = compute_optimal_steps(engine)

    while True:
        steps = compute_optimal_steps(engine)
        for step in steps:
            engine.step(step)
            print(engine)
            time.sleep(0.05)
# -*- coding: utf-8 -*-
from itertools import count

import numpy as np

from engine import TetrisEngine, board_to_bool
from heuristic import heuristic_fn, complete_line

width, height = 10, 20  # standard tetris friends rules
engine = TetrisEngine(width, height, enable_KO=False)


class FixedPolicyAgent:
    def __init__(self):
        self.current_actions = []

    def get_action(self, engine, shape, anchor, board):
        if len(self.current_actions) == 0:
            _, _, self.current_actions = self.select_action(
                engine, shape, anchor, board)
        action = self.current_actions.pop(0)
        return action

    def select_action(self, engine, shape, anchor, board):
        actions_name_final_location_map = engine.get_valid_final_states(
            shape, anchor, board)
        act_pairs = [(k, v[2], v[3])
                     for k, v in actions_name_final_location_map.items()]
        placements = [board_to_bool(p) for k, p, actions in act_pairs]
        h_score = [heuristic_fn(s, complete_line(s)) for s in placements]
        act_idx = np.argmax(h_score)
        actions_name, final_placement, actions = act_pairs[act_idx]
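# Illustrative sketch (not part of the original file): one way to drive the
# engine with FixedPolicyAgent, mirroring the step_to_final pattern used by
# the genetic-agent script further down. The 5-tuple returned by
# step_to_final is taken from that script; select_action is assumed to
# return the (actions_name, placement, actions) triple it computes above.
if __name__ == '__main__':
    agent = FixedPolicyAgent()
    engine.clear()
    total_cleared = 0
    for t in count():
        _, _, actions = agent.select_action(engine, engine.shape,
                                            engine.anchor, engine.board)
        state, reward, done, cleared_lines, sent_lines = engine.step_to_final(
            actions)
        total_cleared += cleared_lines
        if done:
            print("cleared lines:", total_cleared)
            break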
                    '--epoch-len',
                    help='Number of training epochs',
                    metavar='E',
                    type=int,
                    default=1000)
parser.add_argument('-n',
                    '--num-engines',
                    help='Number of simultaneous training engines',
                    metavar='N',
                    type=int,
                    default=100)
args = parser.parse_args()

# Initialize the engines.
engines = [
    TetrisEngine(args.width, args.length) for _ in range(args.num_engines)
]

# Build the training and sampling models.
train_model, sample_model = build_models(
    args.width,
    args.length,
    len(engines[0].shapes),
    len(engines[0].actions),
)

# Load the existing weights, if present.
if os.path.exists(args.model_save_loc):
    try:
        train_model.load_weights(args.model_save_loc)
    except:
def play_game_with_gen(dict_genes, engine):
    engine.clear()
    sl = 0
    for t in count():
        actions_name, placement, actions = genetic_agent.select_action(
            engine, engine.shape, engine.anchor, engine.board, dict_genes)
        # Observations
        state, reward, done, cleared_lines, sent_lines = engine.step_to_final(
            actions)
        # Accumulate the number of lines sent to the opponent
        sl += sent_lines

        logger.info(engine)
        logger.info(f"Sent lines: {sl}")
        time.sleep(.1)

        if done:
            break

    logger.info("")
    logger.info("")
    logger.info("")


if __name__ == '__main__':
    engine = TetrisEngine(width, height, enable_KO=False)
    darwin = GeneticAlgorithm(population_size=50,
                              mutation_rate=0.05,
                              num_generations=30,
                              engine=engine)
    darwin.evolve_the_beasts()
class Agent:
    def __init__(self,
                 lr=0.01,
                 gamma=0.9,
                 batch_size=100,
                 eps_start=1,
                 eps_end=0,
                 eps_test=0,
                 target_model_update=1000,
                 seq_memory_limit=50000,
                 epsilon_decay=1,
                 comment=""):
        # hyperparameters:
        self.LEARNING_RATE = lr  # default = 0.001 -> a higher LR learns faster but can become unstable or get stuck in a local minimum
        self.GAMMA = gamma  # discount factor applied to future rewards
        self.BATCH_SIZE = batch_size  # default = 32 -> too small for tetris?
        self.EPSILON_START = eps_start
        self.EPSILON_END = eps_end
        self.EPSILON_DECAY = epsilon_decay  # after how many steps epsilon reaches epsilon end
        self.TARGET_MODEL_UPDATE = target_model_update  # default is 10000
        self.EPSILON_TEST = eps_test
        self.SEQUENTIAL_MEMORY_LIMIT = seq_memory_limit
        self.TEST_MAX_EPISODE_STEPS = 10000
        self.TRAIN_MAX_EPISODE_STEPS = 10000
        self.MAX_STEP_SCORE = 500  # score if max episode steps are reached
        self.DYING_PEN = 50

        # comment for plots
        self.COMMENT = comment

        # Initializes a Tetris playing field of width 10 and height 20.
        self.env = TetrisEngine(dying_pen=self.DYING_PEN,
                                max_steps=self.TRAIN_MAX_EPISODE_STEPS,
                                max_step_score=self.MAX_STEP_SCORE)
        self.agent = None

    # target model update in source code:
    # if self.target_model_update >= 1 and self.step % self.target_model_update == 0:
    # -> I think the total number of steps has to be a multiple of target_model_update for the update to trigger
    @timer
    def train(self, nb_steps=1000, visualise=True):
        """ the training process of the deep Q agent """
        # Resets the environment
        self.env.reset_environment()

        # init Neural network
        actions = 6  # there are 6 discrete actions
        model = self.build_model_conv(actions)
        model.summary()

        # define callbacks
        callbacks = build_callbacks()

        # init and fit the agent
        dqn = self.build_agent(model, actions, nb_steps)
        dqn.compile(Adam(lr=self.LEARNING_RATE), metrics=['mae', 'mse'])
        history_training = dqn.fit(
            self.env,
            nb_steps=nb_steps,
            callbacks=callbacks,
            visualize=visualise,
            log_interval=self.TARGET_MODEL_UPDATE,
            verbose=1,
            nb_max_episode_steps=self.TRAIN_MAX_EPISODE_STEPS)

        # plot the results
        self._plot_custom_results(self.env.df_info,
                                  history_training,
                                  mode='training')

        # save trained agent
        self.agent = dqn
        return dqn

    @timer
    def test(self, nb_episodes=10, visualize=True):
        """ The testing process of the deep q agent """
        self.env.reset_environment()
        history_test = self.agent.test(
            self.env,
            nb_episodes=nb_episodes,
            visualize=visualize,
            nb_max_episode_steps=self.TEST_MAX_EPISODE_STEPS)
        print(np.mean(history_test.history['episode_reward']))

        # plot the results
        self._plot_custom_results(self.env.df_info, history_test, mode='test')

    def save(self, name):
        """ saving the model weights for future use """
        self.agent.save_weights(f'models/{name}.model', overwrite=False)

    def build_model_conv(self, actions):
        """ define the neural network model architecture for the deep q agent """
        model = tf.keras.models.Sequential()
        model.add(
            Conv2D(32, (2, 2),
                   padding='same',
                   kernel_initializer='he_uniform',
                   kernel_constraint=max_norm(3),
                   input_shape=(1, self.env.height, self.env.width)))
        model.add(BatchNormalization())
        model.add(Activation('tanh'))
        model.add(
            Conv2D(64, (2, 2),
                   padding='same',
                   kernel_initializer='he_uniform',
                   kernel_constraint=max_norm(3)))
        model.add(BatchNormalization())
        model.add(Activation('tanh'))
        model.add(
            Conv2D(64, (2, 2),
                   padding='same',
                   kernel_initializer='he_uniform',
                   kernel_constraint=max_norm(3)))
        model.add(BatchNormalization())
        model.add(Activation('tanh'))
        # model.add(MaxPooling2D(pool_size=(2,2)))

        # end of convolutional layers, start of 'hidden' dense layers
        model.add(Flatten())
        model.add(
            Dense(128,
                  kernel_initializer='he_uniform',
                  kernel_constraint=max_norm(3)))
        model.add(BatchNormalization())
        model.add(Activation('tanh'))
        model.add(Dropout(0.5))

        # Final dense layer
        model.add(Dense(actions))
        model.add(BatchNormalization())
        model.add(Activation('linear'))
        return model

    def build_agent(self, model, actions, nb_steps):
        """ building the deep q agent

        GAMMA:
        REWARD = r1 + gamma*r2 + gamma^2*r3 + gamma^3*r4 ...
        -> gamma is the discount applied to future rewards.
        In general, most algorithms learn faster when they don't have to look
        too far into the future, so it sometimes helps performance to set gamma
        relatively low. For many problems a gamma of 0.9 or 0.95 is fine.

        LAMBDA:
        The lambda parameter determines how much you bootstrap on earlier
        learned values versus using the current Monte Carlo roll-out. This
        implies a trade-off between more bias (low lambda) and more variance
        (high lambda). A general rule of thumb is to use a lambda of 0.9,
        but it can pay off to try a few settings (e.g., 0, 0.5, 0.8, 0.9,
        0.95 and 1.0).
        """
        policy = LinearAnnealedPolicy(
            EpsGreedyQPolicy(),  # takes current best action with prob (1 - epsilon)
            attr='eps',  # decay epsilon (=exploration) per agent step
            value_max=self.EPSILON_START,  # start value of epsilon (default = 1)
            value_min=self.EPSILON_END,  # last value of epsilon (default = 0)
            value_test=self.EPSILON_TEST,
            nb_steps=self.EPSILON_DECAY * nb_steps)
        memory = SequentialMemory(limit=self.SEQUENTIAL_MEMORY_LIMIT,
                                  window_length=1)
        build_agent = DQNAgent(model=model,
                               memory=memory,
                               policy=policy,
                               gamma=self.GAMMA,
                               batch_size=self.BATCH_SIZE,
                               nb_actions=actions,
                               nb_steps_warmup=1000,
                               target_model_update=self.TARGET_MODEL_UPDATE,
                               enable_double_dqn=False,
                               train_interval=4)
        return build_agent

    def _plot_custom_results(self, df, history, mode='training'):
        """ plot custom results """
        # input data
        if 'new_episode' not in df:
            raise KeyError(
                'the dataframe has to have the new_episode column to plot the results'
            )
        df["nr_episode"] = df["new_episode"].cumsum()
        df_results = df.groupby('nr_episode', as_index=False) \
            .agg(height_diff_sum=('height_difference', 'sum'),
                 new_block_sum=('new_block', 'sum'),
                 nr_lines_sum=('number_of_lines', 'max'),
                 score_sum=('score', 'sum'),
                 score_avg=('score', 'mean'),
                 count_steps=('nr_episode', 'count'))
        df_results['moving_average_score'] = df_results.score_sum.expanding().mean()
        df_results['moving_average_lines'] = df_results.nr_lines_sum.expanding().mean()

        # init plot
        figure = pyplot.figure(figsize=(20, 10), dpi=80)
        figure.canvas.set_window_title(mode)

        # PLOT 1: EPISODE REWARD
        pyplot.subplot(221)
        # data (the dict keys are different for training and test)
        if mode == 'training':
            episode_key = 'nb_episode_steps'
        else:
            episode_key = 'nb_steps'
        y_1 = history.history[episode_key]
        y_2 = history.history['episode_reward']
        ind = np.arange(len(y_1))

        # bars
        width = 0.35  # the width of the bars
        pyplot.bar(ind, y_1, width, color='g', label='nb_episode_steps')
        pyplot.ylabel('nr steps per episode')
        pyplot.xlabel('episode')
        pyplot.legend(loc="upper left")

        # line
        axes2 = pyplot.twinx()
        axes2.plot(ind, y_2, color='k', label='episode_reward')
        axes2.set_ylabel('episode reward')
        pyplot.legend(loc="upper right")

        # title
        pyplot.title(mode + ': episode reward and steps per episode')

        # PLOT 2: NR OF LINES CLEARED PER EPISODE
        pyplot.subplot(222)
        x = df_results['nr_episode']
        y = df_results['nr_lines_sum']
        # plotting the points
        pyplot.plot(x, y)
        # naming the x axis
        pyplot.xlabel('episodes')
        # naming the y axis
        pyplot.ylabel('nr_of_lines')
        # title
        pyplot.title(mode + ': number of lines per episode')

        # save the plots
        timestr = time.strftime("%m%d_%H%M%S")
        pyplot.savefig("logs/img_info_" + timestr)

        # PLOT 3: MOVING AVERAGE TOTAL SCORE
        pyplot.subplot(223)
        x = df_results['nr_episode']
        y = df_results['moving_average_score']
        # plotting the points
        pyplot.plot(x, y)
        # naming the x axis
        pyplot.xlabel('episodes')
        # naming the y axis
        pyplot.ylabel('moving average total score')
        # title
        pyplot.title(mode + ': moving average total score')

        # PLOT 4: MOVING AVERAGE LINES CLEARED
        pyplot.subplot(224)
        x = df_results['nr_episode']
        y = df_results['moving_average_lines']
        # plotting the points
        pyplot.plot(x, y)
        # naming the x axis
        pyplot.xlabel('episodes')
        # naming the y axis
        pyplot.ylabel('moving average nr of lines')

        # add subtitle with hyperparams
        subtitle = f"Epsilon start: {self.EPSILON_START}, Epsilon end: {self.EPSILON_END}, Gamma: {self.GAMMA}, LR: {self.LEARNING_RATE}, " \
                   f"target model update: {self.TARGET_MODEL_UPDATE}, Batch size: {self.BATCH_SIZE}, comment: {self.COMMENT}"
        pyplot.figtext(0.01, 0.01, subtitle, fontsize=15)

        # title
        pyplot.title(mode + ': moving average nr of lines')

        # save the plots
        timestr = time.strftime("%m%d_%H%M%S")
        pyplot.savefig("logs/img_info_" + timestr)

        # show the plots
        pyplot.show()
        pyplot.close()

    def plot_metrics(self, save_fig=False):
        """ plot the callback metrics """
        # plot the logs
        with open('dqn_log.json') as json_file:
            data = json.load(json_file)
        df_log = pd.DataFrame.from_dict(data)

        figure = pyplot.figure(figsize=(20, 10), dpi=80)
        for idx, col in enumerate(df_log.columns):
            self._combine_metrics(df_log, col, idx)

        # add subtitle
        subtitle = f"Epsilon start: {self.EPSILON_START}, Epsilon end: {self.EPSILON_END}, Gamma: {self.GAMMA}, LR: {self.LEARNING_RATE}, " \
                   f"target model update: {self.TARGET_MODEL_UPDATE}, Batch size: {self.BATCH_SIZE}, comment: {self.COMMENT}"
        pyplot.figtext(0.01, 0.01, subtitle, fontsize=15)

        # save fig
        timestr = time.strftime("%m%d_%H%M%S")
        if save_fig:
            pyplot.savefig("logs/img_logs_" + timestr)
        pyplot.show()

    @staticmethod
    def _combine_metrics(df, key, index):
        """ helper method for the plot_metrics function """
        pyplot.subplot(4, 3, index + 1)
        pyplot.subplots_adjust(hspace=0.5)
        y = df[key]
        x = df['episode']
        # plotting the points
        pyplot.plot(x, y)
        # naming the x axis
        pyplot.xlabel('episode nr')
        # naming the y axis
        pyplot.ylabel(key.replace('_', ' '))
        # title
        pyplot.title(key.replace('_', ' '))
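# Illustrative helper (not part of the agent above): a standalone check of the
# discounted-return formula quoted in the build_agent docstring,
# REWARD = r1 + gamma*r2 + gamma^2*r3 + ...
def discounted_return(rewards, gamma=0.9):
    """Sum of rewards discounted by gamma per step."""
    total = 0.0
    for k, r in enumerate(rewards):
        total += (gamma ** k) * r
    return total


# With gamma = 0.9: 1 + 0.9 + 0.81 + 0.729 = 3.439
print(discounted_return([1, 1, 1, 1], gamma=0.9))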
# -*- coding: utf-8 -*-
import sys
import os
import numpy as np
from collections import namedtuple, deque
from itertools import count

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from engine import TetrisEngine

width, height = 10, 20  # standard tetris friends rules
engine = TetrisEngine(width, height, enable_KO=False)

eps = 10.**-8

use_cuda = torch.cuda.is_available()
if use_cuda:
    print("....Using Gpu...")

FloatTensor = torch.cuda.FloatTensor if use_cuda else torch.FloatTensor
LongTensor = torch.cuda.LongTensor if use_cuda else torch.LongTensor
ByteTensor = torch.cuda.ByteTensor if use_cuda else torch.ByteTensor

Transition = namedtuple(
    'Transition', ('state', 'action', 'shape', 'anchor', 'board', 'reward'))


class CNN_lay(nn.Module):
    def __init__(self):
        super(CNN_lay, self).__init__()
class TetrisGame:
    def __init__(self):
        self.game_state = GameState(TetrisBoard())
        #from blocks import BlockLine, BlockRightL, BlockCube
        #l = BlockLine()
        #l.rotate()
        #r = BlockRightL()
        #r.rotate(-1)
        #c = BlockCube()
        #self.game_state.board.place_block(l, (-1,0))
        #self.game_state.board.place_block(r, (1,0))
        #self.game_state.board.place_block(c, (8,0))
        self.engine = TetrisEngine(self.game_state)
        self.ai = TetrisAI(self.engine)

    def run_main(self):
        self.engine.start()
        # Busy-wait until the engine reports that it is running.
        while not self.engine.running():
            pass
        self.ai.play()
        while self.engine.running():
            try:
                c = getch()
                if c == LEFT_KEY:
                    self.engine.move_left()
                if c == RIGHT_KEY:
                    self.engine.move_right()
                if c == DOWN_KEY:
                    self.engine.move_down()
                if c == DROP_KEY:
                    self.engine.drop_block()
                if c == UP_KEY:
                    self.engine.rotate()
            except KeyboardInterrupt:
                self.engine.stop()
import sys
import os
import torch
import time

from engine import TetrisEngine
from dqn_agent import DQN, ReplayMemory, Transition
from torch.autograd import Variable

use_cuda = torch.cuda.is_available()
FloatTensor = torch.cuda.FloatTensor if use_cuda else torch.FloatTensor
LongTensor = torch.cuda.LongTensor if use_cuda else torch.LongTensor

width, height = 10, 20  # standard tetris friends rules
engine = TetrisEngine(width, height)


def load_model(filename):
    model = DQN()
    if use_cuda:
        model.cuda()
    checkpoint = torch.load(filename)
    model.load_state_dict(checkpoint['state_dict'])
    return model


def run(model):
    state = FloatTensor(engine.clear()[None, None, :, :])
    score = 0
    while True:
        # Greedy action: pick the argmax Q-value (volatile=True is the legacy
        # pre-0.4 PyTorch way of disabling gradient tracking).
        action = model(Variable(state, volatile=True).type(FloatTensor)).data.max(1)[1].view(1, 1).type(LongTensor)
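        # Illustrative continuation (an assumption, not the original code):
        # the rest of a greedy rollout loop, assuming this engine's step
        # returns a (board, reward, done) triple like the policy-gradient
        # script above.
        board, reward, done = engine.step(int(action[0, 0]))
        score += reward

        # Re-wrap the new board as a (1, 1, H, W) tensor for the next forward pass.
        state = FloatTensor(board[None, None, :, :])

        # Render and pace the playback so a human can follow it.
        print(engine)
        time.sleep(0.05)

        if done:
            print('score:', score)
            break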