import os
import threading

import numpy as np
import tensorflow as tf
import gym_tetris
from gym_tetris.actions import MOVEMENT
from nes_py.wrappers import JoypadSpace


def play(self):
    env = gym_tetris.make('TetrisA-v0')
    env = JoypadSpace(env, MOVEMENT)
    state = env.reset()
    model = self.global_model
    model_path = os.path.join(self.save_dir, 'model_{}.h5'.format('Tetris'))
    print('Loading model from: {}'.format(model_path))
    model.load_weights(model_path)
    done = False
    step_counter = 0
    reward_sum = 0
    pieza_colocada = True
    informacion = env.get_info()
    antiguo_statistics = informacion['statistics']
    state = [0, 0, 0, 0]
    while not done:
        env.render()
        # Once a piece has locked in, pick the target column and rotation for the next one
        if pieza_colocada:
            pieza_colocada = False
            pos = 5
            giro = 1   # starts at 1, matching the training loop, so giro_objetivo is never 0
            u = -1
            state = [state]
            policy, value = model(tf.convert_to_tensor(state, dtype=tf.float32))
            policy = tf.nn.softmax(policy)
            action = np.argmax(policy)
            pos_objetivo = action % 10
            giro_objetivo = (action // 10) + 1
        # Rotate and shift the piece towards the chosen target, then drop it
        if (giro % giro_objetivo) != 0 and not done:
            state, reward, done, info = env.step(1)   # rotate
            accion = 0
            giro = giro + 1
        elif pos > pos_objetivo and not done:
            state, reward, done, info = env.step(6)   # move left
            pos = pos - 1
            accion = 0
        elif pos < pos_objetivo and not done:
            state, reward, done, info = env.step(3)   # move right
            pos = pos + 1
            accion = 0
        elif not done and not pieza_colocada:
            state, reward, done, info = env.step(9)   # soft drop
            accion = 9
        else:
            accion = 0
        if not done:
            state, reward, done, info = env.step(accion)
            env.render()
            informacion = env.get_info()
            # The in-game statistics change exactly when a piece locks in
            if antiguo_statistics != informacion['statistics']:
                antiguo_statistics = informacion['statistics']
                pieza_colocada = True                 # plan the next piece
                state = board_prop(env.board())[:]    # rebuild the 4-feature state, as in training
                step_counter += 1
    env.close()
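# ---------------------------------------------------------------------------
# Hypothetical sketch: ActorCriticModel is used throughout this section but is
# not part of the excerpt. The call sites only pin down its interface:
# model(states) must return a (policy logits, value) pair, built for
# state_size = 4 board features and action_size = 40 (10 columns x 4 rotations,
# inferred from action % 10 and the index 39 handled in Worker.run). The layer
# sizes below are illustrative guesses, not the author's architecture.
# ---------------------------------------------------------------------------
class ActorCriticModel(tf.keras.Model):
    def __init__(self, state_size, action_size):
        super(ActorCriticModel, self).__init__()
        self.state_size = state_size
        self.action_size = action_size
        self.dense1 = tf.keras.layers.Dense(100, activation='relu')
        self.policy_logits = tf.keras.layers.Dense(action_size)  # actor head
        self.dense2 = tf.keras.layers.Dense(100, activation='relu')
        self.values = tf.keras.layers.Dense(1)                   # critic head

    def call(self, inputs):
        # Two independent towers over the same input: one producing the policy
        # logits, the other the state-value estimate
        logits = self.policy_logits(self.dense1(inputs))
        values = self.values(self.dense2(inputs))
        return logits, values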
episodios = 5000       # Number of games to play
max_steps = None       # Number of steps to take in each game
entrenar_cada = 1      # Train once every this many episodes

file = open("resultados_Combine.txt", "w")
agent = DQNAgent()
puntuacion_max = 0
for episodio in range(0, episodios):
    state = env.reset()
    terminado = False
    pos = 5
    piez = pieza_clase()
    informacion = env.get_info()
    antiguo_statistics = informacion['statistics']
    estado = [0, 0, 0, 0]
    pieza_colocada = True
    u = -1
    ant_nom_piez = ''
    lineas_completadas = 0
    eliminado = False
    # An episode starts here
    for i in range(0, 100000):
        # If the game has ended, leave the episode
        if terminado:
            break
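# Note (assumption): the snippet above references `env` without constructing
# it; presumably it is created the same way as in play(), since the setup code
# is not part of this excerpt:
#
#     env = gym_tetris.make('TetrisA-v0')
#     env = JoypadSpace(env, MOVEMENT)
#
# DQNAgent and pieza_clase are likewise defined elsewhere in the project.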
class Worker(threading.Thread):
    global_episode = 0
    global_moving_average_reward = 0
    best_score = 0
    save_lock = threading.Lock()

    def __init__(self, state_size, action_size, global_model, opt, result_queue,
                 idx, game_name='Tetris', save_dir='/tmp'):
        super(Worker, self).__init__()
        self.state_size = state_size
        self.action_size = action_size
        self.result_queue = result_queue
        self.global_model = global_model
        self.opt = opt
        self.local_model = ActorCriticModel(self.state_size, self.action_size)
        self.worker_idx = idx
        self.env = gym_tetris.make('TetrisA-v0')
        self.env = JoypadSpace(self.env, MOVEMENT)
        self.save_dir = save_dir
        self.ep_loss = 0.0
        self.game_name = 'Tetris'

    def run(self):
        total_step = 1
        mem = Memory()
        while Worker.global_episode < episodios:
            self.env.reset()
            estado = [0., 0., 0., 0.]
            nuevo_estado = estado[:]   # defined up front so compute_loss never sees an unset state
            mem.clear()
            ep_reward = 0.
            ep_steps = 0
            self.ep_loss = 0
            informacion = self.env.get_info()
            antiguo_statistics = informacion['statistics']
            time_count = 0
            done = False
            pieza_colocada = True
            while not done:
                # Once a piece has locked in, choose the column and rotation for the next one
                if pieza_colocada:
                    pieza_colocada = False
                    pos = 5
                    giro = 1
                    u = -1
                    ant_nom_piez = ''
                    estado = [estado]
                    logits, _ = self.local_model(
                        tf.convert_to_tensor(estado, dtype=tf.float32))
                    probs = tf.nn.softmax(logits)
                    # np.random.choice needs the probabilities to sum exactly to 1,
                    # so the last entry is recomputed from the remaining mass
                    prob = probs[0][39]
                    probs = np.delete(probs[0], 39)
                    suma = np.sum(probs)
                    probs = np.insert(probs, 39, abs(1 - suma))
                    action = np.random.choice(self.action_size, p=probs)
                    pos_objetivo = action % 10
                    giro_objetivo = (action // 10) + 1
                # Rotate and shift the piece towards the chosen target, then drop it
                if (giro % giro_objetivo) != 0 and not done:
                    state, reward, done, info = self.env.step(1)   # rotate
                    accion = 0
                    giro = giro + 1
                elif pos > pos_objetivo and not done:
                    state, reward, done, info = self.env.step(6)   # move left
                    pos = pos - 1
                    accion = 0
                elif pos < pos_objetivo and not done:
                    state, reward, done, info = self.env.step(3)   # move right
                    pos = pos + 1
                    accion = 0
                elif not done and not pieza_colocada:
                    state, reward, done, info = self.env.step(9)   # soft drop
                    accion = 9
                else:
                    accion = 0
                if not done:
                    new_state, reward, done, info = self.env.step(accion)
                    informacion = self.env.get_info()
                    # A change in the statistics means the piece has locked in:
                    # score the completed placement
                    if antiguo_statistics != informacion['statistics']:
                        antiguo_statistics = informacion['statistics']
                        ep_reward_new = informacion['score']
                        reward = ep_reward_new - ep_reward
                        board = self.env.board()
                        nuevo_estado = board_prop(board)[:]
                        pieza_colocada = True
                        k = 1
                        # Treat a stack higher than 18 rows as a lost game
                        if nuevo_estado[0] > 18:
                            done = True
                        ep_reward = ep_reward_new
                        mem.store(estado[0], action, reward)
                        ep_steps += 1
                        time_count += 1   # counts placements, so updates happen every 10 pieces
                        estado = nuevo_estado
                # Every 10 placements (or at game over) compute the local gradient from
                # the loss of the current game and push it to the global model
                if (time_count == 10 or done) and len(mem.states) > 0:
                    with tf.GradientTape() as tape:
                        total_loss = self.compute_loss(done, nuevo_estado, mem, 0.99)
                    self.ep_loss += total_loss
                    grads = tape.gradient(total_loss, self.local_model.trainable_weights)
                    # Apply the local gradients to the *global* model, then sync back
                    self.opt.apply_gradients(
                        zip(grads, self.global_model.trainable_weights))
                    self.local_model.set_weights(self.global_model.get_weights())
                    mem.clear()
                    time_count = 0
                    if done:
                        Worker.global_moving_average_reward = \
                            record(Worker.global_episode, ep_reward, self.worker_idx,
                                   Worker.global_moving_average_reward, self.result_queue,
                                   self.ep_loss, ep_steps)
                        # Save the global model whenever a worker beats the best score so far
                        if ep_reward > Worker.best_score:
                            with Worker.save_lock:
                                self.global_model.save_weights(
                                    os.path.join(self.save_dir,
                                                 'model_{}.h5'.format(self.game_name)))
                            Worker.best_score = ep_reward
                        Worker.global_episode += 1
                total_step += 1
        self.result_queue.put(None)

    # A3C loss: discounted returns, advantage, value loss, policy loss and entropy bonus
    def compute_loss(self, done, nuevo_estado, memory, gamma=0.99):
        if done:
            reward_sum = 0.  # terminal state: nothing left to bootstrap
        else:
            nuevo_estado = [nuevo_estado]
            # Bootstrap from the critic's value estimate of the last state
            reward_sum = self.local_model(
                tf.convert_to_tensor(nuevo_estado, dtype=tf.float32))[-1].numpy()[0, 0]
        # Discounted returns, walking the stored rewards backwards
        discounted_rewards = []
        for reward in memory.rewards[::-1]:
            reward_sum = reward + gamma * reward_sum
            discounted_rewards.append(reward_sum)
        discounted_rewards.reverse()
        logits, values = self.local_model(
            tf.convert_to_tensor(np.vstack(memory.states), dtype=tf.float32))
        # Keep the per-step tensors 1-D so the element-wise products below do not
        # silently broadcast to an (N, N) matrix
        advantage = tf.convert_to_tensor(np.array(discounted_rewards),
                                         dtype=tf.float32) - tf.squeeze(values, axis=1)
        value_loss = advantage ** 2
        # Entropy of the policy, used as an exploration bonus
        policy = tf.nn.softmax(logits)
        entropy = tf.nn.softmax_cross_entropy_with_logits(labels=policy, logits=logits)
        policy_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=memory.actions, logits=logits)
        policy_loss *= tf.stop_gradient(advantage)
        policy_loss -= 0.01 * entropy
        total_loss = tf.reduce_mean(0.5 * value_loss + policy_loss)
        return total_loss
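# ---------------------------------------------------------------------------
# Hypothetical sketches of the three helpers used above but missing from this
# excerpt: Memory, record() and board_prop(). Their interfaces are inferred
# from the call sites; the bodies below are illustrative assumptions, not the
# author's implementation.
# ---------------------------------------------------------------------------

# Memory: a plain rolling buffer of (state, action, reward) triples.
class Memory:
    def __init__(self):
        self.states = []
        self.actions = []
        self.rewards = []

    def store(self, state, action, reward):
        self.states.append(state)
        self.actions.append(action)
        self.rewards.append(reward)

    def clear(self):
        self.states = []
        self.actions = []
        self.rewards = []


# record(): assumed to keep a moving average of episode rewards, log progress
# and push the average onto the result queue (the 0.99/0.01 smoothing and the
# log format are guesses).
def record(episode, episode_reward, worker_idx, global_ep_reward,
           result_queue, total_loss, num_steps):
    if global_ep_reward == 0:
        global_ep_reward = episode_reward
    else:
        global_ep_reward = global_ep_reward * 0.99 + episode_reward * 0.01
    print('Episode: {} | Moving average reward: {:.1f} | Episode reward: {:.1f} | '
          'Loss: {:.3f} | Steps: {} | Worker: {}'.format(
              episode, global_ep_reward, episode_reward,
              float(total_loss), num_steps, worker_idx))
    result_queue.put(global_ep_reward)
    return global_ep_reward


# board_prop(): the code only pins down that it returns a 4-element feature
# vector whose first entry is the stack height (a game is ended when it
# exceeds 18). Holes, bumpiness and completed lines are a common choice for
# the remaining Tetris features, and the 0-means-empty board convention is
# also an assumption.
def board_prop(board):
    filled = np.asarray(board) != 0
    rows, cols = filled.shape
    # Column height = number of rows from the topmost filled cell downwards
    heights = [rows - int(np.argmax(filled[:, c])) if filled[:, c].any() else 0
               for c in range(cols)]
    max_height = max(heights)
    # Holes: empty cells that have at least one filled cell above them
    holes = sum(int(np.sum(~filled[rows - h:, c]))
                for c, h in enumerate(heights))
    # Bumpiness: total height difference between adjacent columns
    bumpiness = sum(abs(heights[c] - heights[c + 1]) for c in range(cols - 1))
    # Lines that are currently complete on the board
    lines = int(np.sum(filled.all(axis=1)))
    return [max_height, holes, bumpiness, lines]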