def test_config_file_exist(
    exp_name: str,
    file_path: str,
    subclss_tknzr: BaseTknzr,
):
    r"""Save configuration as file."""
    subclss_tknzr.save(exp_name)
    assert os.path.exists(file_path)
def test_config_file_format(
    exp_name: str,
    file_path: str,
    subclss_tknzr: BaseTknzr,
):
    r"""Saved configuration must be in JSON format."""
    subclss_tknzr.save(exp_name)
    with open(file_path, 'r', encoding='utf-8') as input_file:
        # Raise error if not valid JSON.
        assert json.load(input_file)
def test_load_result(
    exp_name: str,
    subclss_tknzr: BaseTknzr,
    subclss_tknzr_clss: Type[BaseTknzr],
):
    r"""Ensure configuration consistency between save and load."""
    subclss_tknzr.save(exp_name)
    load_tknzr = subclss_tknzr_clss.load(exp_name)

    assert subclss_tknzr.is_uncased == load_tknzr.is_uncased
    assert subclss_tknzr.id2tk == load_tknzr.id2tk
    assert subclss_tknzr.max_vocab == load_tknzr.max_vocab
    assert subclss_tknzr.min_count == load_tknzr.min_count
    assert subclss_tknzr.tk2id == load_tknzr.tk2id
def gen(self, model: BaseModel, tknzr: BaseTknzr, txt: str) -> str:
    """Generate continual text conditioned on given text segment.

    Top-P inference algorithm is structured as follows:

    #. Encode input text as 1 sample batch.
    #. Remove token ids after ``[eos]`` since model is not trained to predict tokens after seeing ``[eos]``.
    #. Loop over conditioned token ids to generate conditioned hidden states.
    #. Loop to generate token ids.  In each iteration, the generated token id is chosen among the top-k highest
       probabilities of the next token id prediction probability distribution, where :math:`k` is the number of
       token ids whose cumulative probability (after sorting probabilities in descending order) is less than or
       equal to ``self.p``.  The generation loop stops early if ``[eos]`` is generated; otherwise it stops only
       when the maximum length constraint enforced by ``self.max_seq_len`` is reached.
    #. Decode generated token ids into text and return.

    Parameters
    ----------
    model: lmp.model.BaseModel
        Pre-trained language model which will be used to generate text.
    tknzr: lmp.tknzr.BaseTknzr
        Pre-trained tokenizer which performs text encoding and decoding.
    txt: str
        Text segment which the generation process is conditioned on.

    Returns
    -------
    str
        Generated text.
    """
    # Get model running device.
    device = next(model.parameters()).device

    # Encode as 1 sample batch.  Convert token ids to tensor and move tensor to the same running device as model.
    # shape: (1, max_seq_len).
    batch_cur_tkids = torch.LongTensor(
        tknzr.batch_enc(batch_txt=[txt], max_seq_len=self.max_seq_len)
    ).to(device)

    # Remove token ids after `[eos]` since model is not trained to predict tokens after seeing `[eos]`.
    mask = (batch_cur_tkids == EOS_TKID) | (batch_cur_tkids == PAD_TKID)
    seq_len = batch_cur_tkids.size(1) - mask.sum()
    batch_cur_tkids = batch_cur_tkids[:, :seq_len]

    # Loop over conditioned token ids to generate conditioned hidden states.
    batch_prev_states = None
    for i in range(seq_len - 1):
        _, batch_prev_states = model.pred(
            batch_cur_tkids=batch_cur_tkids[:, i],
            batch_prev_states=batch_prev_states,
        )

    # Calculate how many tokens at most can be generated.
    out_seq_len = self.max_seq_len - seq_len + 1

    # Generate token ids.
    batch_cur_tkids = batch_cur_tkids[:, -1]
    gen_tkids: List[int] = []
    for _ in range(out_seq_len):
        # Get next token id prediction probability distribution.
        # shape: (1, vocab_size).
        batch_next_tkids_pd, batch_prev_states = model.pred(
            batch_cur_tkids=batch_cur_tkids,
            batch_prev_states=batch_prev_states,
        )

        # Sort the probability distribution in descending order.
        # shape: (1, vocab_size).
        batch_next_tkids_sort_pd, batch_next_tkids_sort = batch_next_tkids_pd.sort(dim=1, descending=True)

        # Calculate cumulative probability distribution and count the indices whose cumulative probability is
        # less than or equal to the threshold `self.p`.
        k = int((batch_next_tkids_sort_pd.cumsum(dim=1) <= self.p).sum().item())

        # Sometimes the highest probability is larger than `self.p`, which means the model is highly confident in
        # predicting the next token id.  In that case the calculation above results in `k == 0`, so we fall back
        # to choosing only the token id with the highest probability by setting `k = 1`.
        if k == 0:
            k = 1

        # The first `k` token ids in `batch_next_tkids_sort` have cumulative probability less than or equal to
        # `self.p`.  We fetch them and perform further sampling.
        # shape: (1, k).
        batch_next_tkids_sort_pd = batch_next_tkids_sort_pd[:, :k]
        batch_next_tkids_sort = batch_next_tkids_sort[:, :k]

        # Use the top-k highest probabilities to construct a multinomial distribution, then sample a token id from
        # the multinomial distribution as the next token id prediction result.
        # `batch_next_tkids_topk_sample` shape: (1, 1).
        batch_next_tkids_topk_sample = torch.multinomial(batch_next_tkids_sort_pd, num_samples=1)

        # Use sampled result to fetch next token id prediction.
        # shape: (1).
        batch_next_tkids = torch.gather(
            input=batch_next_tkids_sort,
            dim=1,
            index=batch_next_tkids_topk_sample,
        ).squeeze(1)
        gen_tkid = int(batch_next_tkids.item())
        gen_tkids.append(gen_tkid)

        # Update input token ids.
        batch_cur_tkids = batch_next_tkids

        # If the predicted token id is `[eos]`, then stop generation immediately.
        if gen_tkid == EOS_TKID:
            break

    # Output generated text.
    return tknzr.batch_dec(batch_tkids=[gen_tkids], rm_sp_tks=True)[0]
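# A minimal standalone sketch of the top-p cutoff computed in `gen` above, run on a toy
# probability distribution.  The values of `toy_probs` and the threshold `toy_p` are made-up
# illustration values, not taken from the project; only `torch` APIs are used.
import torch

toy_p = 0.9
# Toy next-token distribution over a 5 token vocabulary (exact binary fractions keep the cumsum exact).
toy_probs = torch.tensor([[0.0625, 0.5, 0.125, 0.25, 0.0625]])

# Sort probabilities in descending order, exactly as `gen` does.
toy_sort_pd, toy_sort_ids = toy_probs.sort(dim=1, descending=True)

# Count how many sorted probabilities fit inside cumulative mass `toy_p`.
# cumsum == [0.5, 0.75, 0.875, 0.9375, 1.0], so the count is 3 when toy_p == 0.9.
toy_k = max(int((toy_sort_pd.cumsum(dim=1) <= toy_p).sum().item()), 1)

# Token ids kept as candidates for multinomial sampling.
print(toy_sort_ids[:, :toy_k])  # tensor([[1, 3, 2]])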
def gen(self, model: BaseModel, tknzr: BaseTknzr, txt: str) -> str:
    """Generate continual text conditioned on given text segment.

    Top-K inference algorithm is structured as follows:

    #. Encode input text as 1 sample batch.
    #. Remove token ids after ``[eos]`` since model is not trained to predict tokens after seeing ``[eos]``.
    #. Loop over conditioned token ids to generate conditioned hidden states.
    #. Loop to generate token ids.  In each iteration, the generated token id is chosen among the top-K highest
       probabilities of the next token id prediction probability distribution.  The generation loop stops early if
       ``[eos]`` is generated; otherwise it stops only when the maximum length constraint enforced by
       ``self.max_seq_len`` is reached.
    #. Decode generated token ids into text and return.

    Parameters
    ----------
    model: lmp.model.BaseModel
        Pre-trained language model which will be used to generate text.
    tknzr: lmp.tknzr.BaseTknzr
        Pre-trained tokenizer which performs text encoding and decoding.
    txt: str
        Text segment which the generation process is conditioned on.

    Returns
    -------
    str
        Generated text.
    """
    # Get model running device.
    device = next(model.parameters()).device

    # Encode as 1 sample batch.  Convert token ids to tensor and move tensor to the same running device as model.
    # shape: (1, max_seq_len).
    batch_cur_tkids = torch.LongTensor(
        tknzr.batch_enc(batch_txt=[txt], max_seq_len=self.max_seq_len)
    ).to(device)

    # Remove token ids after `[eos]` since model is not trained to predict tokens after seeing `[eos]`.
    mask = (batch_cur_tkids == EOS_TKID) | (batch_cur_tkids == PAD_TKID)
    seq_len = batch_cur_tkids.size(1) - mask.sum()
    batch_cur_tkids = batch_cur_tkids[:, :seq_len]

    # Loop over conditioned token ids to generate conditioned hidden states.
    batch_prev_states = None
    for i in range(seq_len - 1):
        _, batch_prev_states = model.pred(
            batch_cur_tkids=batch_cur_tkids[:, i],
            batch_prev_states=batch_prev_states,
        )

    # Calculate how many tokens at most can be generated.
    out_seq_len = self.max_seq_len - seq_len + 1

    # Generate token ids.
    batch_cur_tkids = batch_cur_tkids[:, -1]
    gen_tkids: List[int] = []
    for _ in range(out_seq_len):
        # Get next token id prediction probability distribution.
        # shape: (1, vocab_size).
        batch_next_tkids_pd, batch_prev_states = model.pred(
            batch_cur_tkids=batch_cur_tkids,
            batch_prev_states=batch_prev_states,
        )

        # Get top-K highest probabilities from next token id prediction probability distribution.
        # shape: (1, k).
        batch_next_tkids_topk_p, batch_next_tkids_topk = batch_next_tkids_pd.topk(k=self.k, dim=-1)

        # Use the top-K highest probabilities to construct a multinomial distribution, then sample a token id from
        # the multinomial distribution as the next token id prediction result.
        # `batch_next_tkids_topk_sample` shape: (1, 1).
        batch_next_tkids_topk_sample = torch.multinomial(batch_next_tkids_topk_p, num_samples=1)

        # Use sampled result to fetch next token id prediction.
        # shape: (1).
        batch_next_tkids = torch.gather(
            input=batch_next_tkids_topk,
            dim=1,
            index=batch_next_tkids_topk_sample,
        ).squeeze(1)
        gen_tkid = int(batch_next_tkids.item())
        gen_tkids.append(gen_tkid)

        # Update input token ids.
        batch_cur_tkids = batch_next_tkids

        # If the predicted token id is `[eos]`, then stop generation immediately.
        if gen_tkid == EOS_TKID:
            break

    # Output generated text.
    return tknzr.batch_dec(batch_tkids=[gen_tkids], rm_sp_tks=True)[0]
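# A minimal standalone sketch of the top-K sampling step used in `gen` above, run on a toy
# probability distribution.  The distribution and `toy_k` are made-up illustration values,
# not taken from the project; only `torch` APIs are used.
import torch

toy_k = 3
toy_probs = torch.tensor([[0.05, 0.4, 0.1, 0.3, 0.15]])

# Keep the K most probable token ids and their probabilities.
# topk_p == [[0.4, 0.3, 0.15]], topk_ids == [[1, 3, 4]].
topk_p, topk_ids = toy_probs.topk(k=toy_k, dim=-1)

# Sample one position from the top-K probabilities (torch.multinomial normalizes the weights),
# then map the sampled position back to a token id.
sample_pos = torch.multinomial(topk_p, num_samples=1)
next_tkid = torch.gather(input=topk_ids, dim=1, index=sample_pos).squeeze(1)
print(int(next_tkid.item()))  # One of 1, 3 or 4.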
def test_arguments() -> None:
    """Must have correct arguments."""
    parser = argparse.ArgumentParser()
    BaseTknzr.add_CLI_args(parser=parser)
    assert parser.parse_args([]) == argparse.Namespace()
def test_lower_case(subclss_tknzr: BaseTknzr, case_txt: Dict[str, str]):
    r"""Test output text is converted to lower case."""
    if subclss_tknzr.is_uncased:
        assert subclss_tknzr.norm(case_txt['input']) == case_txt['output']
    else:
        assert subclss_tknzr.norm(case_txt['input']) == case_txt['input']
def test_strip_whitespace(subclss_tknzr: BaseTknzr, htws_txt: Dict[str, str]):
    r"""Test output text is stripped."""
    assert subclss_tknzr.norm(htws_txt['input']) == htws_txt['output']
def test_collapse_whitespace(subclss_tknzr: BaseTknzr, cws_txt: Dict[str, str]):
    r"""Test whitespaces in output text are collapsed."""
    assert subclss_tknzr.norm(cws_txt['input']) == cws_txt['output']
def test_nfkc(subclss_tknzr: BaseTknzr, non_nfkc_txt: Dict[str, str]):
    r"""Test output text is normalized with NFKC."""
    assert subclss_tknzr.norm(non_nfkc_txt['input']) == non_nfkc_txt['output']
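# A minimal standalone illustration of the NFKC normalization the test above relies on, using only
# the standard library.  The sample string is a made-up illustration value; the tokenizer's own
# `norm` may additionally lower-case text and strip or collapse whitespace, which is not shown here.
import unicodedata

# Fullwidth digits and the 'ﬁ' ligature are compatibility characters; NFKC folds them into their
# canonical ASCII forms.
print(unicodedata.normalize('NFKC', '１２３ ﬁne'))  # '123 fine'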