import copy
import pickle
import random
import time
from abc import ABC, abstractmethod
from collections import deque, namedtuple

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

from pysc2.agents.base_agent import BaseAgent
from pysc2.lib import actions, features
from pysc2.lib.actions import FUNCTIONS

# BattleAgent (the DQN agent that BattleAgentPretrained extends) lives
# elsewhere in the repo; the module path here is an assumption.
from battle_agent import BattleAgent

# Screen-feature indices and action ids, following the usual pysc2 pattern.
# Minimal stand-ins for the remaining project-local helpers (Transition,
# ReplayMemory, Epsilon) follow below.
_PLAYER_RELATIVE = features.SCREEN_FEATURES.player_relative.index
_UNIT_TYPE = features.SCREEN_FEATURES.unit_type.index
_UNIT_HIT_POINTS = features.SCREEN_FEATURES.unit_hit_points.index
_PLAYER_ENEMY = features.PlayerRelative.ENEMY
_NO_OP = actions.FUNCTIONS.no_op.id
_SELECT_ARMY = actions.FUNCTIONS.select_army.id
_MOVE_SCREEN = actions.FUNCTIONS.Move_screen.id


def _xy_locs(mask):
    """Return the (x, y) locations of the nonzero positions in mask
    (as in pysc2's scripted_agent example)."""
    y, x = mask.nonzero()
    return list(zip(x, y))
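# NOTE: Transition, ReplayMemory and Epsilon are project-local helpers that do
# not appear in this file. The definitions below are minimal stand-ins,
# reconstructed only from how the agents use them (push/sample/len, the
# .memory attribute, and epsilon's value()/isTraining/_value); the project's
# real implementations may differ.
Transition = namedtuple('Transition', ('s', 'a', 's_1', 'r', 'done'))


class ReplayMemory:
    """Fixed-size buffer of transitions."""

    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []
        self.position = 0

    def push(self, transition):
        # Overwrite the oldest entry once the buffer is full.
        if len(self.memory) < self.capacity:
            self.memory.append(transition)
        else:
            self.memory[self.position] = transition
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        # Stack each field into its own numpy array, shaped the way train_q
        # consumes them (rewards and done flags as column vectors).
        batch = Transition(*zip(*random.sample(self.memory, batch_size)))
        return (np.concatenate(batch.s),
                np.array(batch.a),
                np.concatenate(batch.s_1),
                np.array(batch.r, dtype=np.float32).reshape(-1, 1),
                np.array(batch.done, dtype=np.float32).reshape(-1, 1))

    def __len__(self):
        return len(self.memory)


class Epsilon:
    """Linearly annealed exploration rate."""

    def __init__(self, start=0.9, end=0.1, update_increment=0.0001):
        self._end = end
        self._update_increment = update_increment
        self._value = start
        self.isTraining = True

    def value(self):
        # No exploration when evaluating.
        return self._value if self.isTraining else 0.0

    def increment(self, steps=1):
        self._value = max(self._end, self._value - self._update_increment * steps)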
class BattleAgentScriptedBeacon(BaseAgent):
    """Scripted data collector for the beacon map (base class assumed)."""

    def __init__(self, mapname):
        super(BattleAgentScriptedBeacon, self).__init__()
        self.max_frames = int(1e5)
        self._memory = ReplayMemory(self.max_frames)
        self.obs = None
        # Screen-feature index 5 is player_relative, so this agent observes
        # unit ownership only.
        self.features = 5
        self.mapname = mapname
        self.screen_size = 32
class BattleAgentPretrained(BattleAgent):

    def __init__(self, save_name):
        super(BattleAgentPretrained, self).__init__(save_name=save_name)

    def pretrain(self, memory_pretrained_fn, batch_size=512, iterations=int(1e5)):
        # Load a replay memory gathered by the scripted agent and fit Q to it offline.
        with open(memory_pretrained_fn, 'rb') as f:
            memory_pretrained = pickle.load(f)
        self._memory = ReplayMemory(len(memory_pretrained))
        self._memory.memory = memory_pretrained
        self.train_q_batch_size = batch_size

        start_time = time.time()
        for i in range(iterations):
            if i % 500 == 0:
                if i > 1:
                    # Sanity check: report parameter drift between the target
                    # and online nets before syncing them.
                    for key in self._Qt.state_dict():
                        print((self._Qt.state_dict()[key]
                               - self._Q.state_dict()[key]).sum())
                self._Qt = copy.deepcopy(self._Q)
            if i % 10000 == 0:
                torch.save(self._Q.state_dict(), f'{self.save_name}_{i}.pth')
                with open(f'{self.save_name}_loss_{i}.pkl', 'wb') as f:
                    pickle.dump(self.loss, f)
                print(f'Training iteration {i}...')
            self.train_q(squeeze=True)
        end_time = time.time()
        print(f'Training completed. Took {end_time - start_time} seconds')

        torch.save(self._Q.state_dict(), f'{self.save_name}_{iterations}.pth')
        with open(f'{self.save_name}_loss_{iterations}.pkl', 'wb') as f:
            pickle.dump(self.loss, f)

    def train_q(self, squeeze=False):
        # Same update as the base class, except losses are recorded in
        # self.loss (which pretrain saves) rather than the internal deque.
        if self.train_q_batch_size >= len(self._memory):
            return

        s, a, s_1, r, done = self._memory.sample(self.train_q_batch_size)
        s = torch.from_numpy(s).to(self.device).float()
        a = torch.from_numpy(a).to(self.device).long().unsqueeze(1)
        s_1 = torch.from_numpy(s_1).to(self.device).float()
        r = torch.from_numpy(r).to(self.device).float()
        not_done = torch.from_numpy(1 - done).to(self.device).float()  # masks out terminal transitions
        if squeeze:
            s = s.squeeze()
            s_1 = s_1.squeeze()

        # Q(s, a) for the actions actually taken.
        Q = self._Q(s).view(self.train_q_batch_size, -1)
        Q = Q.gather(1, a)

        # Double-Q target: the online net picks the next action, the target
        # net scores it. Computed without gradients so only Q(s, a) trains.
        with torch.no_grad():
            Qt = self._Qt(s_1).view(self.train_q_batch_size, -1)
            best_action = self._Q(s_1).view(self.train_q_batch_size, -1).max(dim=1, keepdim=True)[1]
            y = r + not_done * self.gamma * Qt.gather(1, best_action)

        loss = self._criterion(Q, y)
        self.loss.append(loss.item())
        self._max_q.append(Q.max().item())

        self._optimizer.zero_grad()  # zero the gradient buffers
        loss.backward()
        self._optimizer.step()
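# NOTE: a minimal usage sketch for offline pretraining. The network here is a
# hypothetical fully-convolutional Q-model (one Q-value per screen pixel), and
# the file names and map directory are placeholders, not the project's real ones.
def example_pretrain():
    model = nn.Sequential(
        nn.Conv2d(3, 16, kernel_size=5, padding=2),  # 3 input layers: player_relative, unit_type, hit_points
        nn.ReLU(),
        nn.Conv2d(16, 1, kernel_size=3, padding=1),  # (1, 32, 32) -> flattened to 1024 actions
    )
    agent = BattleAgentPretrained(save_name='./data/DefeatRoaches/pretrained')
    agent.initialize_model(model)
    agent.pretrain('./data/DefeatRoaches/scripted_replaymemory_res32.pkl',
                   batch_size=512, iterations=int(1e5))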
class BattleAgentScripted(BaseAgent):

    def __init__(self, mapname):
        super(BattleAgentScripted, self).__init__()
        self.max_frames = int(1e5)
        self._memory = ReplayMemory(self.max_frames)
        self.obs = None
        self.features = [_PLAYER_RELATIVE, _UNIT_TYPE, _UNIT_HIT_POINTS]
        self.mapname = mapname
        self.screen_size = 32

    def get_action(self, obs):
        if FUNCTIONS.Attack_screen.id in obs.observation.available_actions:
            player_relative = obs.observation.feature_screen.player_relative
            roaches = _xy_locs(player_relative == _PLAYER_ENEMY)
            if not roaches:
                return FUNCTIONS.no_op()
            # Find the roach with max y coord.
            target = roaches[np.argmax(np.array(roaches)[:, 1])]
            return FUNCTIONS.Attack_screen("now", target)
        else:
            return FUNCTIONS.no_op()

    def run_loop(self, env):
        """A run loop to have agents and an environment interact."""
        start_time = time.time()
        total_frames = 0
        action_spec = env.action_spec()
        observation_spec = env.observation_spec()
        self.setup(observation_spec, action_spec)
        try:
            while True:
                obs = env.reset()[0]
                # Remove unit selection from the equation by selecting the
                # entire army on every new game.
                select_army = actions.FunctionCall(_SELECT_ARMY, [[False]])
                obs = env.step([select_army])[0]
                self.reset()
                episode_reward = 0
                while True:
                    total_frames += 1
                    self.obs = obs.observation["feature_screen"][self.features]
                    s = np.expand_dims(self.obs, 0)
                    if total_frames >= self.max_frames:
                        fn = (f'./data/{self.mapname}/'
                              f'scripted_replaymemory_res{self.screen_size}.pkl')
                        with open(fn, 'wb') as f:
                            pickle.dump(self._memory.memory, f)
                        print("max frames reached")
                        return
                    if obs.last():
                        break
                    action = self.get_action(obs)
                    if len(action[-1]) > 0:
                        # Flatten the (x, y) screen target into a single action index.
                        a_idx = action[-1][-1]
                        action_indices = [a_idx[1], a_idx[0]]
                        action_index = np.ravel_multi_index(
                            action_indices, [self.screen_size, self.screen_size])
                    obs = env.step([action])[0]
                    r = obs.reward
                    episode_reward += r
                    s1 = np.expand_dims(
                        obs.observation["feature_screen"][self.features], 0)
                    done = r > 0
                    if len(action[-1]) > 0:
                        transition = Transition(s, action_index, s1, r, done)
                        self._memory.push(transition)
                print(f'Total frames: {total_frames}')
        finally:
            print("finished")
            elapsed_time = time.time() - start_time
            print("Took %.3f seconds for %s steps: %.3f fps" % (
                elapsed_time, total_frames, total_frames / elapsed_time))
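# NOTE: sketch of how the scripted collector might be launched. The map name,
# step_mul and 32x32 resolution are assumptions consistent with the constants
# above, not values taken from the project's launch script.
def example_collect_replays():
    from pysc2.env import sc2_env

    with sc2_env.SC2Env(
            map_name='DefeatRoaches',
            players=[sc2_env.Agent(sc2_env.Race.terran)],
            agent_interface_format=features.AgentInterfaceFormat(
                feature_dimensions=features.Dimensions(screen=32, minimap=32)),
            step_mul=8,
            visualize=False) as env:
        agent = BattleAgentScripted(mapname='DefeatRoaches')
        agent.run_loop(env)  # fills and pickles agent._memory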
class BaseRLAgent(BaseAgent, ABC):

    def __init__(self, save_name='./data/', load_name=None):
        super(BaseRLAgent, self).__init__()
        self.training = False
        self.max_frames = 10000000
        self._epsilon = Epsilon(start=0.9, end=0.1, update_increment=0.0001)
        self.gamma = 0.99
        self.train_q_per_step = 4
        self.train_q_batch_size = 256
        self.steps_before_training = 5000
        self.target_q_update_frequency = 10000

        self.save_name = save_name
        if load_name is None:
            self.load_name = self.save_name
        else:
            self.load_name = load_name

        self._Q = None
        self._Qt = None
        self._optimizer = None
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self._criterion = nn.MSELoss()
        self._memory = ReplayMemory(100000)

        self._loss = deque(maxlen=int(1e5))
        self._max_q = deque(maxlen=int(1e5))
        self.loss = []
        self.max_q = []
        self.reward = []

        self._action = None
        self._screen = None
        self._screen_size = 32
        self.n_episodes = 0
        self.features = None

    def initialize_model(self, model):
        self._Q = model
        self._Qt = copy.deepcopy(self._Q)
        self._Q.to(self.device)
        self._Qt.to(self.device)
        self._optimizer = optim.Adam(self._Q.parameters(), lr=1e-8)

    def load_model_checkpoint(self, load_params=True):
        self._Q.load_state_dict(torch.load(self.load_name + '.pth'))
        # Sanity check: print the loaded weights next to the (stale) target weights.
        for key in self._Q.state_dict():
            print(self._Q.state_dict()[key])
            print(self._Qt.state_dict()[key])
        if load_params:
            with open(f'{self.load_name}_data.pkl', 'rb') as f:
                saved_data = pickle.load(f)
            self.loss = saved_data['loss']
            self.max_q = saved_data['max_q']
            self._epsilon._value = saved_data['epsilon']
            self.reward = saved_data['reward']
            self.n_episodes = saved_data['n_episodes']

    def get_env_action(self, action, obs, command=_MOVE_SCREEN):
        # Unflatten the action index into (channel, y, x); unit selection is
        # removed from the equation, so only the screen target is decoded.
        action = np.unravel_index(action, [1, self._screen_size, self._screen_size])
        target = [action[2], action[1]]
        if command in obs.observation["available_actions"]:
            return actions.FunctionCall(command, [[0], target])
        else:
            return actions.FunctionCall(_NO_OP, [])

    def save_data(self, episodes_done=0):
        save_data = {'loss': self.loss,
                     'max_q': self.max_q,
                     'epsilon': self._epsilon._value,
                     'reward': self.reward,
                     'n_episodes': self.n_episodes}
        if episodes_done > 0:
            save_name = self.save_name + f'_checkpoint{episodes_done}'
        else:
            save_name = self.save_name
        torch.save(self._Q.state_dict(), save_name + '.pth')
        with open(f'{save_name}_data.pkl', 'wb') as f:
            pickle.dump(save_data, f)

    def evaluate(self, env, max_episodes=10000, load_dict=True):
        if load_dict:
            self.load_model_checkpoint(load_params=False)
        self._epsilon.isTraining = False
        while True:
            self.run_loop(env, self.max_frames, max_episodes=max_episodes,
                          evaluate_checkpoints=0)

    def train(self, env, training=True, max_episodes=10000):
        self._epsilon.isTraining = training
        self.run_loop(env, self.max_frames, max_episodes=max_episodes)
        if self._epsilon.isTraining:
            self.save_data()

    @abstractmethod
    def run_loop(self, env, max_frames, max_episodes, evaluate_checkpoints):
        pass

    def get_action(self, s, unsqueeze=True):
        # Epsilon-greedy: exploit the Q-network with probability 1 - epsilon.
        if np.random.rand() > self._epsilon.value():
            s = torch.from_numpy(s).to(self.device)
            if unsqueeze:
                s = s.unsqueeze(0).float()
            else:
                s = s.float()
            with torch.no_grad():
                self._action = self._Q(s).squeeze().cpu().data.numpy()
            return self._action.argmax()
        # Otherwise explore: pick a uniformly random screen coordinate.
        else:
            action = 0
            target = np.random.randint(0, self._screen_size, size=2)
            return (action * self._screen_size * self._screen_size
                    + target[0] * self._screen_size + target[1])

    def train_q(self, squeeze=False):
        if self.train_q_batch_size >= len(self._memory):
            return

        s, a, s_1, r, done = self._memory.sample(self.train_q_batch_size)
        s = torch.from_numpy(s).to(self.device).float()
        a = torch.from_numpy(a).to(self.device).long().unsqueeze(1)
        s_1 = torch.from_numpy(s_1).to(self.device).float()
        r = torch.from_numpy(r).to(self.device).float()
        not_done = torch.from_numpy(1 - done).to(self.device).float()  # masks out terminal transitions
        if squeeze:
            s = s.squeeze()
            s_1 = s_1.squeeze()

        # Q(s, a) for the actions actually taken.
        Q = self._Q(s).view(self.train_q_batch_size, -1)
        Q = Q.gather(1, a)

        # Double-Q target: the online net picks the best next action, the
        # target net evaluates it: y = r + gamma * Qt(s', argmax_a Q(s', a)).
        # Computed without gradients so only Q(s, a) is trained.
        with torch.no_grad():
            Qt = self._Qt(s_1).view(self.train_q_batch_size, -1)
            best_action = self._Q(s_1).view(self.train_q_batch_size, -1).max(dim=1, keepdim=True)[1]
            y = r + not_done * self.gamma * Qt.gather(1, best_action)

        loss = self._criterion(Q, y)
        self._loss.append(loss.item())
        self._max_q.append(Q.max().item())

        self._optimizer.zero_grad()  # zero the gradient buffers
        loss.backward()
        self._optimizer.step()
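# NOTE: toy, self-contained check of the double-Q target computed in train_q;
# all numbers are made up for illustration.
def example_double_q_target():
    gamma = 0.99
    r = torch.tensor([[1.0], [0.0]])
    not_done = torch.tensor([[1.0], [0.0]])            # second transition is terminal
    q_online = torch.tensor([[0.2, 0.8], [0.5, 0.1]])  # Q(s', .) from the online net
    q_target = torch.tensor([[0.3, 0.6], [0.4, 0.2]])  # Q(s', .) from the target net

    # The online net selects the action, the target net evaluates it.
    best_action = q_online.max(dim=1, keepdim=True)[1]  # [[1], [0]]
    y = r + not_done * gamma * q_target.gather(1, best_action)
    print(y)  # tensor([[1.5940], [0.0000]]) = r + gamma * Qt(s', argmax_a Q(s', a))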