def acts(self, states):
    """Sample one legal move for every state in ``states``.

    Each state is encoded with ``Translator.encode_board`` and the batch is
    passed through ``self.net``; element ``[0]`` of the network output is
    used as the policy. For every state the policy row is restricted to the
    indices of its ``micro_legal_moves`` (via
    ``Translator.encode_move_idx``), renormalised to sum to one, and a
    single index is drawn from the resulting categorical distribution.

    Args:
        states: sized sequence of game states, each exposing
            ``micro_legal_moves``.

    Returns:
        A list holding one move per state, in input order, produced by
        ``Translator.decode_move``. Empty when ``states`` is empty.
    """
    batch = len(states)
    if batch == 0:
        # An empty batch cannot be stacked into a tensor; there is simply
        # nothing to act on.
        return []

    # Same result as cat() over unsqueeze(0)'d boards: a new leading batch axis.
    obs = torch.stack([Translator.encode_board(s) for s in states], 0)
    valids = [
        torch.tensor(
            [Translator.encode_move_idx(m) for m in s.micro_legal_moves],
            dtype=torch.long)
        for s in states
    ]

    # Only sampled integer indices leave this method, so no autograd graph
    # has to be recorded for the forward pass.
    # NOTE(review): assumes the first network output is indexable as
    # [state, move_idx] and holds non-negative weights - confirm.
    with torch.no_grad():
        policy = self.net.forward(obs)[0]

    # Restrict each row to the legal moves and renormalise over them.
    legal_probs = []
    for i in range(batch):
        masked = policy[i, valids[i]]
        legal_probs.append(masked / masked.sum())

    # The sample is a position inside valids[i]; map it back to a move index.
    idxs = [
        valids[i][dist.Categorical(legal_probs[i]).sample()]
        for i in range(batch)
    ]

    return [
        Translator.decode_move(idxs[i].item(), states[i])
        for i in range(batch)
    ]
def extract_episodes(self):
    """Collect episodes and pack each one into a dict of tensors.

    Episodes are obtained from ``collect_episodes`` using a
    ``ReinforcementAI`` built around ``self.judge``, together with the
    ``episode_cnt`` and ``episode_length`` names from the enclosing scope.

    Returns:
        A list with one dict per collected episode, with keys:
            'obs'  - encoded boards concatenated along a new leading axis,
            'idx'  - long tensor of encoded move indices,
            'prob' - entries of the first ``self.judge`` output selected
                     at (row k, idx[k]),
            'ext'  - the episode's 'extrinsic' entry, passed through as is,
            'over' - ``is_game_over()`` of the episode's last board.
    """
    agent = ReinforcementAI(self.judge)
    rollouts = collect_episodes(agent, episode_cnt, episode_length)

    def pack(rollout):
        # Turn one raw rollout into the dict layout described above.
        encoded_boards = tuple(
            Translator.encode_board(board).unsqueeze(0)
            for board in rollout['boards'])
        obs = torch.cat(encoded_boards, 0)

        move_ids = tuple(
            Translator.encode_move_idx(move) for move in rollout['moves'])
        idx = torch.tensor(move_ids, dtype=torch.long)

        # Pair every row with the move index chosen at that step.
        rows = torch.arange(idx.shape[0], dtype=torch.long)
        prob = self.judge.forward(obs)[0][rows, idx]

        ext = rollout['extrinsic']
        return {
            'obs': obs,
            'idx': idx,
            'prob': prob,
            'ext': ext,
            'over': rollout['boards'][-1].is_game_over(),
        }

    episodes = []
    for rollout in rollouts:
        episodes.append(pack(rollout))
    return episodes