def perplexity_eval(
        device: torch.device,
        model: lmp.model.BaseRNNModel,
        sequence: str,
        tokenizer: lmp.tokenizer.BaseTokenizer
) -> float:
    r"""Helper function for calculating perplexity.

    Args:
        device:
            Model running device.
        model:
            Language model.
        sequence:
            Sequence for evaluation.
        tokenizer:
            Tokenizer for encoding sequence.

    Returns:
        Perplexity of `sequence`.
    """
    # Evaluation mode.
    model.eval()

    # Encode sequence and convert into tensor. Original sequence length: S.
    # New sequence length: S + 2.
    sequence = tokenizer.encode(sequence, max_seq_len=-1)

    # `sequence[:-2]` keeps `[bos]` as input (so its output predicts the first
    # token) but never feeds `[eos]`. `x.shape == (S,)`.
    x = torch.LongTensor(sequence[:-2]).to(device)

    # `y.shape == (S,)`.
    y = sequence[1:-1]

    # Reshape into `(1, S)` to fit model.
    x = x.reshape(1, -1)

    # Get model vocabulary prediction with shape `(1, S, V)`.
    pred_y = model.predict(x)

    # Reshape into `(S,)` for easier manipulation.
    x = x.squeeze(0)

    # Reshape into `(S, V)` for easier manipulation.
    pred_y = pred_y.squeeze(0)

    # Accumulate negative log-likelihood.
    nll = torch.zeros(1).to(device)

    # Iterate through each prediction.
    for pos, token_id in enumerate(y):
        probs = pred_y[pos, token_id]
        nll = nll - torch.log(probs)

    # Normalize by length.
    nll = nll / x.size(0)

    # Take exponential to cancel the logarithm.
    return nll.exp().item()
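The helper above exponentiates the length-normalized negative log-likelihood. A minimal self-contained cross-check of that identity, assuming made-up probabilities and targets (none of the names below come from `lmp`):

import torch

# Fake next-token probabilities, shape (S, V), and fake targets, shape (S,).
pred_y = torch.rand(5, 10).softmax(dim=-1)
y = torch.randint(0, 10, (5,))

# Manual accumulation, mirroring the loop in `perplexity_eval`.
nll = -torch.log(pred_y[torch.arange(5), y]).sum() / 5

# Equivalent formulation: `nll_loss` on log-probabilities averages the
# per-position negative log-likelihood.
nll_ce = torch.nn.functional.nll_loss(torch.log(pred_y), y)

assert torch.allclose(nll, nll_ce)
print(nll.exp().item())  # Perplexity of the fake sequence.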
def main() -> None:
    r"""Script entry point."""
    # Parse command-line arguments.
    args = parse_arg()

    # Set random seed for reproducibility.
    lmp.util.rand.set_seed(seed=args.seed)

    # Load pre-trained model configuration.
    model_cfg = lmp.util.cfg.load(exp_name=args.exp_name)

    # Load pre-trained tokenizer configuration.
    tknzr_cfg = lmp.util.cfg.load(exp_name=model_cfg.tknzr_exp_name)

    # Load pre-trained tokenizer instance.
    tknzr = lmp.util.tknzr.load(
        exp_name=tknzr_cfg.exp_name,
        tknzr_name=tknzr_cfg.tknzr_name,
    )

    # Load pre-trained model instance.
    model = lmp.util.model.load(
        ckpt=args.ckpt,
        tknzr=tknzr,
        **model_cfg.__dict__,
    )

    # Get inference method.
    infer = lmp.util.infer.create(
        max_seq_len=model_cfg.max_seq_len,
        **args.__dict__,
    )

    # Get model running device.
    device = torch.device('cpu')
    if torch.cuda.is_available():
        device = torch.device('cuda')

    # Set model to evaluation mode.
    # This turns off dropout layers in the model.
    model = model.eval()

    # Move model to running device.
    model = model.to(device)

    # Generate text with the specified inference method.
    txt = infer.gen(model=model, tknzr=tknzr, txt=args.txt)

    # Output generated text.
    print(txt)
def main() -> None:
    r"""Script entry point."""
    # Parse command-line arguments.
    args = parse_arg()

    # Get dataset instance with the specified version.
    dset = lmp.util.dset.load(dset_name=args.dset_name, ver=args.ver)

    # Mini-batch loader. No shuffling since we are only evaluating.
    dldr = torch.utils.data.DataLoader(
        dataset=dset,
        batch_size=args.batch_size,
        shuffle=False,
    )

    # Load pre-trained model configuration.
    model_cfg = lmp.util.cfg.load(exp_name=args.exp_name)

    # Load pre-trained tokenizer configuration.
    tknzr_cfg = lmp.util.cfg.load(exp_name=model_cfg.tknzr_exp_name)

    # Load pre-trained tokenizer instance.
    tknzr = lmp.util.tknzr.load(
        exp_name=tknzr_cfg.exp_name,
        tknzr_name=tknzr_cfg.tknzr_name,
    )

    # Get model running device.
    device = torch.device('cpu')
    if torch.cuda.is_available():
        device = torch.device('cuda')

    # Get tensorboard logger instance.
    writer = lmp.util.log.get_tb_logger(exp_name=args.exp_name)

    # Load pre-trained checkpoints ranging from `args.first_ckpt` to
    # `args.last_ckpt`.
    for ckpt in lmp.util.model.list_ckpts(
            exp_name=args.exp_name,
            first_ckpt=args.first_ckpt,
            last_ckpt=args.last_ckpt,
    ):
        # Load pre-trained model instance from checkpoint `ckpt`.
        model = lmp.util.model.load(
            ckpt=ckpt,
            tknzr=tknzr,
            **model_cfg.__dict__,
        )

        # Set model to evaluation mode.
        # This turns off dropout layers in the model.
        model = model.eval()

        # Move model to running device.
        model = model.to(device)

        # Record average perplexity.
        avg_ppl = 0.0
        for batch_txt in tqdm(dldr):
            # Encode batch text into batch of token ids.
            batch_tkids = tknzr.batch_enc(
                batch_txt=batch_txt,
                max_seq_len=model_cfg.max_seq_len,
            )

            # Convert batch of token ids to `torch.Tensor` with
            # `dtype == torch.int64`.
            batch_tkids = torch.LongTensor(batch_tkids)

            # Move tensors to model running device.
            batch_tkids = batch_tkids.to(device)

            # Format batch token ids to satisfy language model training
            # format.
            batch_prev_tkids = batch_tkids[..., :-1]
            batch_next_tkids = batch_tkids[..., 1:]

            # Calculate perplexity.
            batch_avg_ppl = model.ppl(
                batch_next_tkids=batch_next_tkids,
                batch_prev_tkids=batch_prev_tkids,
            )

            # Accumulate average perplexity, weighted by batch size.
            avg_ppl += batch_avg_ppl * len(batch_txt) / len(dset)

        # Log average perplexity on the dataset to CLI and tensorboard.
        writer.add_scalar(f'ppl/{args.dset_name}/{args.ver}', avg_ppl, ckpt)
        print(f'checkpoint {ckpt} ppl: {avg_ppl}')
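The accumulation `avg_ppl += batch_avg_ppl * len(batch_txt) / len(dset)` weights each batch by its size so that a smaller final batch does not skew the dataset-level average. A toy numeric sketch of that identity, with made-up numbers:

# Hypothetical per-batch average perplexities and batch sizes.
batch_ppls = [12.0, 8.0, 10.0]
batch_sizes = [4, 4, 2]
dset_size = sum(batch_sizes)

avg_ppl = 0.0
for ppl, n in zip(batch_ppls, batch_sizes):
    # Weight each batch by the fraction of the dataset it covers.
    avg_ppl += ppl * n / dset_size

# Matches the flat per-sample mean over the whole dataset.
assert abs(avg_ppl - (12.0 * 4 + 8.0 * 4 + 10.0 * 2) / dset_size) < 1e-9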
def perplexity_eval(
        device: torch.device,
        model: Union[lmp.model.BaseRNNModel, lmp.model.BaseResRNNModel],
        sequence: str,
        tokenizer: lmp.tokenizer.BaseTokenizer
) -> float:
    r"""Helper function for calculating perplexity.

    Args:
        device:
            Model running device.
        model:
            Language model.
        sequence:
            Sequence for evaluation. Must not be empty.
        tokenizer:
            Tokenizer for encoding sequence.

    Raises:
        TypeError:
            When one of the arguments is not an instance of its type
            annotation.
        ValueError:
            When `sequence` is empty.

    Returns:
        Perplexity of `sequence`.
    """
    # Type check.
    if not isinstance(device, torch.device):
        raise TypeError('`device` must be an instance of `torch.device`.')

    if not isinstance(model, (
            lmp.model.BaseRNNModel,
            lmp.model.BaseResRNNModel
    )):
        raise TypeError(
            '`model` must be an instance of '
            '`Union[lmp.model.BaseRNNModel, lmp.model.BaseResRNNModel]`.'
        )

    if not isinstance(sequence, str):
        raise TypeError('`sequence` must be an instance of `str`.')

    if not isinstance(tokenizer, lmp.tokenizer.BaseTokenizer):
        raise TypeError(
            '`tokenizer` must be an instance of `lmp.tokenizer.BaseTokenizer`.'
        )

    # Value check.
    if not sequence:
        raise ValueError('`sequence` must not be empty.')

    # Evaluation mode.
    model.eval()

    # Encode sequence and convert into tensor. Original sequence length: S.
    # New sequence length: S + 2.
    sequence = tokenizer.encode(sequence, max_seq_len=-1)

    # `sequence[:-2]` keeps `[bos]` as input (so its output predicts the first
    # token) but never feeds `[eos]`. `x.shape == (S,)`.
    x = torch.LongTensor(sequence[:-2]).to(device)

    # `y.shape == (S,)`.
    y = sequence[1:-1]

    # Reshape into `(1, S)` to fit model.
    x = x.reshape(1, -1)

    # Get model vocabulary prediction with shape `(1, S, V)`.
    pred_y = model.predict(x)

    # Reshape into `(S,)` for easier manipulation.
    x = x.squeeze(0)

    # Reshape into `(S, V)` for easier manipulation.
    pred_y = pred_y.squeeze(0)

    # Accumulate negative log-likelihood.
    nll = torch.zeros(1).to(device)

    # Iterate through each prediction.
    for pos, token_id in enumerate(y):
        probs = pred_y[pos, token_id]
        nll = nll - torch.log(probs)

    # Normalize by length.
    nll = nll / x.size(0)

    # Take exponential to cancel the logarithm.
    return nll.exp().item()
def main(argv: List[str]) -> None:
    """Script entry point.

    Parameters
    ----------
    argv: list[str]
        List of CLI arguments.

    Returns
    -------
    None
    """
    # Parse CLI arguments.
    args = parse_args(argv=argv)

    # `args.batch_size` validation.
    lmp.util.validate.raise_if_wrong_ordered(
        vals=[1, args.batch_size],
        val_names=['1', 'args.batch_size'],
    )
    # `args.first_ckpt` validation.
    lmp.util.validate.raise_if_wrong_ordered(
        vals=[-1, args.first_ckpt],
        val_names=['-1', 'args.first_ckpt'],
    )
    # `args.last_ckpt` validation.
    lmp.util.validate.raise_if_wrong_ordered(
        vals=[-1, args.last_ckpt],
        val_names=['-1', 'args.last_ckpt'],
    )
    # `args.n_worker` validation.
    lmp.util.validate.raise_if_wrong_ordered(
        vals=[0, args.n_worker, len(os.sched_getaffinity(0))],
        val_names=['0', 'args.n_worker', 'number of available CPUs'],
    )
    lmp.util.validate.raise_if_wrong_ordered(
        vals=[args.n_worker, args.batch_size],
        val_names=['args.n_worker', 'args.batch_size'],
    )

    # We use a TCP store to coordinate processes. Timeout is set to 5 minutes.
    store = dist.TCPStore(
        is_master=args.rank == HOST_RANK,
        host_name=args.host_name,
        port=args.host_port,
        timeout=timedelta(minutes=5),
        world_size=args.world_size,
    )

    # Use NCCL backend to perform CUDA collectives.
    dist.init_process_group(
        backend=dist.Backend.NCCL,
        store=store,
        rank=args.rank,
        timeout=timedelta(minutes=5),
        world_size=args.world_size,
    )

    # Sync arguments.
    dist_args_k = ['host_name', 'host_port', 'local_rank', 'rank', 'world_size']
    for k in args.__dict__.keys():
        if k in dist_args_k:
            continue

        # Host broadcasts arguments.
        if args.rank == HOST_RANK:
            store.set(k, str(args.__dict__[k]))
        # Non-host processes receive host arguments.
        else:
            v = store.get(k)
            if isinstance(args.__dict__[k], str):
                args.__dict__[k] = v.decode('utf-8')
            else:
                args.__dict__[k] = type(args.__dict__[k])(v)

    # Set random seed for reproducibility. Note that each process uses a
    # different seed to get a different slice of each batch.
    lmp.util.rand.set_seed(seed=args.seed + args.rank)

    # Get model running device.
    device = torch.device('cpu')
    if torch.cuda.is_available():
        device = torch.device(f'cuda:{args.local_rank}')

    # Load pre-trained model configuration.
    model_cfg = lmp.util.cfg.load(exp_name=args.exp_name)

    # Load pre-trained tokenizer instance.
    tknzr = lmp.util.tknzr.load(exp_name=model_cfg.tknzr_exp_name)

    # Get dataset instance and convert samples to tensors.
    if args.is_dset_in_memory:
        dset: torch.utils.data.Dataset = lmp.util.dset.FastTensorDset(
            dset=lmp.util.dset.load(**args.__dict__),
            max_seq_len=model_cfg.max_seq_len,
            tknzr=tknzr,
        )
    else:
        dset = lmp.util.dset.SlowTensorDset(
            dset=lmp.util.dset.load(**args.__dict__),
            max_seq_len=model_cfg.max_seq_len,
            tknzr=tknzr,
        )

    dset_size = len(dset)

    # Mini-batch sampler. Each process gets batches exclusive to itself.
    dist_sampler = torch.utils.data.distributed.DistributedSampler(
        num_replicas=args.world_size,
        rank=args.rank,
        dataset=dset,
        shuffle=False,
    )

    # Mini-batch distributed loader. We set `persistent_workers = True` only
    # when `args.n_worker > 0`. We set `pin_memory = True` to speed up the
    # process (which only saves a few seconds).
    data_loader = torch.utils.data.DataLoader(
        batch_size=args.batch_size // args.world_size,
        dataset=dset,
        num_workers=args.n_worker,
        persistent_workers=bool(args.n_worker != 0),
        pin_memory=True,
        sampler=dist_sampler,
    )

    # Get tensorboard logger instance. Only the main process needs to log
    # performance.
    if args.rank == HOST_RANK:
        writer = lmp.util.log.get_tb_logger(exp_name=args.exp_name)
    else:
        writer = None

    # Evaluate checkpoints within range.
    for ckpt in lmp.util.model.list_ckpts(
            exp_name=args.exp_name,
            first_ckpt=args.first_ckpt,
            last_ckpt=args.last_ckpt,
    ):
        # Load pre-trained model instance.
        model = lmp.util.model.load(ckpt=ckpt, exp_name=args.exp_name)

        # Set model to evaluation mode. This turns off dropout layers in the
        # model.
        model = model.eval()

        # Move model to running device.
        model = model.to(device)

        # Create DDP model.
        ddp_model = torch.nn.parallel.DistributedDataParallel(model)

        # Processes may receive an uneven number of batches, so we use
        # `ddp_model.join()` to avoid deadlock.
        with ddp_model.join():
            # Record average perplexity.
            avg_ppl = 0.0
            for batch_tkids in tqdm(data_loader):
                # Move token id tensors to the same running device as the
                # model.
                batch_tkids = batch_tkids.to(device)

                # Format batch token ids to satisfy language model training
                # format.
                batch_cur_tkids = batch_tkids[..., :-1]
                batch_next_tkids = batch_tkids[..., 1:]

                # Loop over token ids to get next token id prediction
                # probability distributions.
                batch_prev_states = None
                batch_tkids_pd = []
                for i in range(batch_cur_tkids.size(1)):
                    batch_next_tkids_pd, batch_prev_states = model.pred(
                        batch_cur_tkids=batch_cur_tkids[:, i],
                        batch_prev_states=batch_prev_states,
                    )

                    # Collect prediction probability distribution.
                    batch_tkids_pd.append(batch_next_tkids_pd)

                # Calculate perplexity.
                batch_ppl = lmp.util.metric.ppl(
                    batch_tkids=batch_next_tkids,
                    batch_tkids_pd=torch.stack(batch_tkids_pd, dim=1),
                )

                # Sum `batch_ppl` from each process.
                dist.all_reduce(batch_ppl, op=dist.ReduceOp.SUM)

                # Accumulate average perplexity.
                avg_ppl += (batch_ppl / dset_size).sum().item()

        # Log average perplexity on the dataset to CLI and tensorboard. Only
        # the main process needs to log performance.
        if args.rank == HOST_RANK:
            writer.add_scalar(f'ppl/{args.dset_name}/{args.ver}', avg_ppl, ckpt)
            print(f'checkpoint: {ckpt}, avg ppl: {avg_ppl}')

    # Free memory. This is only needed for unit tests.
    del args
    del avg_ppl
    del batch_cur_tkids
    del batch_next_tkids
    del batch_next_tkids_pd
    del batch_ppl
    del batch_prev_states
    del batch_tkids
    del batch_tkids_pd
    del ckpt
    del data_loader
    del device
    del dset
    del dset_size
    del model
    del model_cfg
    del tknzr
    del writer
    torch.cuda.empty_cache()
    gc.collect()
def main(argv: List[str]) -> None:
    """Script entry point.

    Parameters
    ----------
    argv: list[str]
        List of CLI arguments.

    Returns
    -------
    None
    """
    # Parse CLI arguments.
    args = parse_args(argv=argv)

    # `args.ckpt` validation.
    lmp.util.validate.raise_if_wrong_ordered(
        vals=[-1, args.ckpt],
        val_names=['-1', 'args.ckpt'],
    )
    # `args.txt` validation.
    lmp.util.validate.raise_if_empty_str(val=args.txt, val_name='args.txt')

    # Set random seed for reproducibility.
    lmp.util.rand.set_seed(seed=args.seed)

    # Load pre-trained model configuration.
    model_cfg = lmp.util.cfg.load(exp_name=args.exp_name)

    # Load pre-trained tokenizer instance.
    tknzr = lmp.util.tknzr.load(exp_name=model_cfg.tknzr_exp_name)

    # Load pre-trained model instance.
    model = lmp.util.model.load(ckpt=args.ckpt, exp_name=args.exp_name)

    # Set model to evaluation mode. This turns off dropout layers in the model.
    model = model.eval()

    # Get model running device.
    device = torch.device('cpu')
    if torch.cuda.is_available():
        device = torch.device('cuda')

    # Move model to running device.
    model = model.to(device)

    # Get inference method.
    infer = lmp.util.infer.create(**args.__dict__)

    # Generate text with the specified inference method.
    txt = infer.gen(model=model, tknzr=tknzr, txt=args.txt)

    # Output generated text.
    print(txt)

    # Free memory. This is only needed for unit tests.
    del args
    del device
    del infer
    del model
    del model_cfg
    del tknzr
    del txt
    torch.cuda.empty_cache()
    gc.collect()
def analogy_inference(
        device: torch.device,
        model: Union[lmp.model.BaseRNNModel, lmp.model.BaseResRNNModel],
        tokenizer: lmp.tokenizer.BaseTokenizer,
        word_a: str,
        word_b: str,
        word_c: str
) -> str:
    r"""Generate an analogous word based on `word_a`, `word_b` and `word_c`.

    This function performs word analogy based on the following rule:
        `word_a` : `word_b` = `word_c` : `word_d`
    where `word_d` is the prediction target.

    Args:
        device:
            Model running device.
        model:
            Language model.
        tokenizer:
            Converts tokens (including `word_a`, `word_b` and `word_c`) into
            token ids and converts token ids back to tokens (`word_d`). This
            is needed since we use a word embedding layer in our language
            model.
        word_a:
        word_b:
        word_c:
            Query words for word analogy.

    Raises:
        TypeError:
            When one of the arguments is not an instance of its type
            annotation.

    Returns:
        The predicted word following the word analogy.
    """
    # Type check.
    if not isinstance(device, torch.device):
        raise TypeError('`device` must be an instance of `torch.device`.')

    if not isinstance(model, (
            lmp.model.BaseRNNModel,
            lmp.model.BaseResRNNModel
    )):
        raise TypeError(
            '`model` must be an instance of '
            '`Union[lmp.model.BaseRNNModel, lmp.model.BaseResRNNModel]`.'
        )

    if not isinstance(tokenizer, lmp.tokenizer.BaseTokenizer):
        raise TypeError(
            '`tokenizer` must be an instance of `lmp.tokenizer.BaseTokenizer`.'
        )

    if not isinstance(word_a, str):
        raise TypeError('`word_a` must be an instance of `str`.')

    if not isinstance(word_b, str):
        raise TypeError('`word_b` must be an instance of `str`.')

    if not isinstance(word_c, str):
        raise TypeError('`word_c` must be an instance of `str`.')

    # Evaluation mode.
    model.eval()
    model = model.to(device)

    # Convert tokens (query words) into token ids.
    word_a_id = torch.LongTensor([tokenizer.convert_token_to_id(word_a)])
    word_b_id = torch.LongTensor([tokenizer.convert_token_to_id(word_b)])
    word_c_id = torch.LongTensor([tokenizer.convert_token_to_id(word_c)])

    # Perform analogy calculation.
    # Shape: `(1, E)`.
    out = (
        model.emb_layer(word_b_id.to(device))
        - model.emb_layer(word_a_id.to(device))
        + model.emb_layer(word_c_id.to(device))
    )

    # Calculate cosine similarity.
    # Shape: `(V)`.
    pred = torch.nn.functional.cosine_similarity(out, model.emb_layer.weight)

    # Get the token id with maximum cosine similarity.
    # Shape: `(1)`.
    word_d_id = pred.argmax(dim=0).to('cpu').item()

    # Convert back to token.
    return tokenizer.convert_id_to_token(word_d_id)
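A standalone sketch of the `b - a + c` arithmetic and cosine-similarity lookup performed above, using a randomly initialized embedding table in place of a trained model (the token ids and sizes below are illustrative assumptions, not values from `lmp`):

import torch

# Toy embedding table standing in for `model.emb_layer`.
emb = torch.nn.Embedding(num_embeddings=100, embedding_dim=16)
word_a_id = torch.LongTensor([3])
word_b_id = torch.LongTensor([7])
word_c_id = torch.LongTensor([11])

# `word_b - word_a + word_c`, shape (1, E).
out = emb(word_b_id) - emb(word_a_id) + emb(word_c_id)

# Cosine similarity against every vocabulary embedding, shape (V,).
pred = torch.nn.functional.cosine_similarity(out, emb.weight)

# Token id whose embedding is closest to the analogy vector.
word_d_id = pred.argmax(dim=0).item()
print(word_d_id)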
def generate_sequence(
        beam_width: int,
        begin_of_sequence: str,
        device: torch.device,
        max_seq_len: int,
        model: lmp.model.BaseRNNModel,
        tokenizer: lmp.tokenizer.BaseTokenizer
) -> List[str]:
    r"""Sequence generation using beam search.

    Args:
        beam_width:
            Number of candidate sequences to output.
        begin_of_sequence:
            Beginning of sequence which the model will auto-complete.
        device:
            Model running device.
        max_seq_len:
            Maximum length of output sequences.
        model:
            Language model.
        tokenizer:
            Tokenizer for encoding and decoding sequences.

    Returns:
        Generated sequences.
    """
    # Evaluation mode.
    model.eval()

    # Encode sequence and convert into tensor. Remove `[eos]` since we are
    # using a beginning of sentence.
    cur_seq = tokenizer.encode(begin_of_sequence, max_seq_len=-1)
    cur_seq = torch.LongTensor(cur_seq)[:-1].to(device)

    # Get beginning sequence length.
    seq_len = cur_seq.size(-1)

    # Generated sequence.
    # Start shape (1, S).
    # Final shape (B, S).
    cur_seq = cur_seq.reshape(1, seq_len)

    # Accumulated negative log-likelihood. Using log changes consecutive
    # probability multiplication into a sum of log probabilities, which
    # avoids computational underflow. Initialized to zero with shape (B).
    accum_prob = torch.zeros(beam_width).to(device)

    for _ in range(max_seq_len - seq_len):
        # Model prediction has shape (B, S, V).
        pred_y = model.predict(cur_seq)

        # Record all beams' predictions.
        # Each beam will predict `beam_width` different results.
        # So we have `beam_width * beam_width` different results in total.
        top_k_in_all_beams = []
        for out_beam in range(cur_seq.size(0)):
            # Get `beam_width` different predictions from beam `out_beam`.
            # `top_k_prob_in_beam` has shape (B) and `top_k_index_in_beam`
            # has shape (B).
            top_k_prob_in_beam, top_k_index_in_beam = \
                pred_y[out_beam, -1].topk(k=beam_width, dim=-1)

            # Record each beam's negative log-likelihood and concatenate the
            # next token id based on the prediction.
            for in_beam in range(beam_width):
                # Accumulate negative log-likelihood. Since log outputs a
                # negative value when its input is in range (0, 1), we negate
                # it to be positive.
                prob = accum_prob[out_beam] - top_k_prob_in_beam[in_beam].log()
                prob = prob.unsqueeze(0)

                # Concatenate the next predicted token id.
                seq = torch.cat(
                    [
                        cur_seq[out_beam],
                        top_k_index_in_beam[in_beam].unsqueeze(0)
                    ],
                    dim=-1
                ).unsqueeze(0)

                # Record result.
                top_k_in_all_beams.append({'prob': prob, 'seq': seq})

        # Compare each recorded result in all beams. First concatenate
        # tensors, then use `topk` with `largest=False` to get the
        # `beam_width` most probable predictions in all beams (lower
        # accumulated negative log-likelihood means a more probable
        # sequence).
        _, top_k_index_in_all_beams = torch.cat(
            [beam['prob'] for beam in top_k_in_all_beams]
        ).topk(k=beam_width, dim=0, largest=False)

        # Update `cur_seq` with the `beam_width` most probable results.
        cur_seq = torch.cat(
            [
                top_k_in_all_beams[index]['seq']
                for index in top_k_index_in_all_beams
            ],
            dim=0
        )

        # Update accumulated negative log-likelihood.
        accum_prob = torch.cat(
            [
                top_k_in_all_beams[index]['prob']
                for index in top_k_index_in_all_beams
            ],
            dim=0
        )

    return tokenizer.batch_decode(cur_seq.tolist())
def main() -> None:
    r"""Script entry point."""
    # Parse command-line arguments.
    args = parse_arg()

    # Load pre-trained model configuration.
    model_cfg = lmp.util.cfg.load(exp_name=args.exp_name)

    # Load pre-trained tokenizer configuration.
    tknzr_cfg = lmp.util.cfg.load(exp_name=model_cfg.tknzr_exp_name)

    # Load pre-trained tokenizer instance.
    tknzr = lmp.util.tknzr.load(
        exp_name=tknzr_cfg.exp_name,
        tknzr_name=tknzr_cfg.tknzr_name,
    )

    # Load pre-trained model instance.
    model = lmp.util.model.load(
        ckpt=args.ckpt,
        tknzr=tknzr,
        **model_cfg.__dict__,
    )

    # Get model running device.
    device = torch.device('cpu')
    if torch.cuda.is_available():
        device = torch.device('cuda')

    # Set model to evaluation mode.
    # This turns off dropout layers in the model.
    model = model.eval()

    # Move model to running device.
    model = model.to(device)

    # Encode text into token ids.
    # Wrap as a batch with only one sample since `model.ppl` only accepts
    # batches.
    batch_tkids = tknzr.batch_enc(
        batch_txt=[args.txt],
        max_seq_len=model_cfg.max_seq_len,
    )

    # Convert token ids to `torch.Tensor` with `dtype == torch.int64`.
    batch_tkids = torch.LongTensor(batch_tkids)

    # Move tensors to model running device.
    batch_tkids = batch_tkids.to(device)

    # Format batch token ids to satisfy language model training format.
    batch_prev_tkids = batch_tkids[..., :-1]
    batch_next_tkids = batch_tkids[..., 1:]

    # Calculate perplexity.
    ppl = model.ppl(
        batch_next_tkids=batch_next_tkids,
        batch_prev_tkids=batch_prev_tkids,
    )

    # Output perplexity on the given sample.
    print(ppl)
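The `[..., :-1]` / `[..., 1:]` slicing used above simply offsets inputs and targets by one position. A tiny illustration with made-up token ids:

import torch

# One encoded sample, e.g. `[bos] a b c [eos]` (ids are made up).
batch_tkids = torch.LongTensor([[0, 5, 9, 2, 1]])

batch_prev_tkids = batch_tkids[..., :-1]  # Model input:        [0, 5, 9, 2]
batch_next_tkids = batch_tkids[..., 1:]   # Prediction targets: [5, 9, 2, 1]

print(batch_prev_tkids.tolist(), batch_next_tkids.tolist())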
def main(argv: List[str]) -> None:
    """Script entry point.

    Parameters
    ----------
    argv: list[str]
        List of CLI arguments.

    Returns
    -------
    None
    """
    # Parse CLI arguments.
    args = parse_args(argv=argv)

    # `args.batch_size` validation.
    lmp.util.validate.raise_if_wrong_ordered(
        vals=[1, args.batch_size],
        val_names=['1', 'args.batch_size'],
    )
    # `args.first_ckpt` validation.
    lmp.util.validate.raise_if_wrong_ordered(
        vals=[-1, args.first_ckpt],
        val_names=['-1', 'args.first_ckpt'],
    )
    # `args.last_ckpt` validation.
    lmp.util.validate.raise_if_wrong_ordered(
        vals=[-1, args.last_ckpt],
        val_names=['-1', 'args.last_ckpt'],
    )
    # `args.n_worker` validation.
    lmp.util.validate.raise_if_wrong_ordered(
        vals=[0, args.n_worker, len(os.sched_getaffinity(0))],
        val_names=['0', 'args.n_worker', 'number of available CPUs'],
    )
    lmp.util.validate.raise_if_wrong_ordered(
        vals=[args.n_worker, args.batch_size],
        val_names=['args.n_worker', 'args.batch_size'],
    )

    # Set random seed for reproducibility.
    lmp.util.rand.set_seed(seed=args.seed)

    # Get model running device.
    device = torch.device('cpu')
    if torch.cuda.is_available():
        device = torch.device('cuda')

    # Load pre-trained model configuration.
    model_cfg = lmp.util.cfg.load(exp_name=args.exp_name)

    # Load pre-trained tokenizer instance.
    tknzr = lmp.util.tknzr.load(exp_name=model_cfg.tknzr_exp_name)

    # Get dataset instance and convert samples to tensors.
    if args.is_dset_in_memory:
        dset: torch.utils.data.Dataset = lmp.util.dset.FastTensorDset(
            dset=lmp.util.dset.load(**args.__dict__),
            max_seq_len=model_cfg.max_seq_len,
            tknzr=tknzr,
        )
    else:
        dset = lmp.util.dset.SlowTensorDset(
            dset=lmp.util.dset.load(**args.__dict__),
            max_seq_len=model_cfg.max_seq_len,
            tknzr=tknzr,
        )

    dset_size = len(dset)

    # Mini-batch loader. We set `persistent_workers = True` only when
    # `args.n_worker > 0`. We set `pin_memory = True` to speed up the process
    # (which only saves a few seconds).
    data_loader = torch.utils.data.DataLoader(
        batch_size=args.batch_size,
        dataset=dset,
        shuffle=False,
        num_workers=args.n_worker,
        persistent_workers=bool(args.n_worker != 0),
        pin_memory=True,
    )

    # Get tensorboard logger instance.
    writer = lmp.util.log.get_tb_logger(exp_name=args.exp_name)

    # Evaluate checkpoints within range.
    for ckpt in lmp.util.model.list_ckpts(
            exp_name=args.exp_name,
            first_ckpt=args.first_ckpt,
            last_ckpt=args.last_ckpt,
    ):
        # Load pre-trained model instance.
        model = lmp.util.model.load(ckpt=ckpt, exp_name=args.exp_name)

        # Set model to evaluation mode. This turns off dropout layers in the
        # model.
        model = model.eval()

        # Move model to running device.
        model = model.to(device)

        # Record average perplexity.
        avg_ppl = 0.0
        for batch_tkids in tqdm(data_loader):
            # Move token id tensors to the same running device as the model.
            batch_tkids = batch_tkids.to(device)

            # Format batch token ids to satisfy language model training
            # format.
            batch_cur_tkids = batch_tkids[..., :-1]
            batch_next_tkids = batch_tkids[..., 1:]

            # Loop over token ids to get next token id prediction probability
            # distributions.
            batch_prev_states = None
            batch_tkids_pd = []
            for i in range(batch_cur_tkids.size(1)):
                batch_next_tkids_pd, batch_prev_states = model.pred(
                    batch_cur_tkids=batch_cur_tkids[:, i],
                    batch_prev_states=batch_prev_states,
                )

                # Collect prediction probability distribution.
                batch_tkids_pd.append(batch_next_tkids_pd)

            # Calculate perplexity.
            batch_ppl = lmp.util.metric.ppl(
                batch_tkids=batch_next_tkids,
                batch_tkids_pd=torch.stack(batch_tkids_pd, dim=1),
            )

            # Accumulate average perplexity.
            avg_ppl += (batch_ppl / dset_size).sum().item()

        # Log average perplexity on the dataset to CLI and tensorboard.
        writer.add_scalar(f'ppl/{args.dset_name}/{args.ver}', avg_ppl, ckpt)
        print(f'checkpoint: {ckpt}, avg ppl: {avg_ppl}')

    # Free memory. This is only needed for unit tests.
    del args
    del avg_ppl
    del batch_cur_tkids
    del batch_next_tkids
    del batch_next_tkids_pd
    del batch_ppl
    del batch_prev_states
    del batch_tkids
    del batch_tkids_pd
    del ckpt
    del data_loader
    del device
    del dset
    del dset_size
    del model
    del model_cfg
    del tknzr
    del writer
    torch.cuda.empty_cache()
    gc.collect()
def generate_sequence(
        beam_width: int,
        begin_of_sequence: str,
        device: torch.device,
        max_seq_len: int,
        model: Union[lmp.model.BaseRNNModel, lmp.model.BaseResRNNModel],
        tokenizer: lmp.tokenizer.BaseTokenizer
) -> List[str]:
    r"""Sequence generation using beam search.

    Args:
        beam_width:
            Number of candidate sequences to output. Must be bigger than or
            equal to `1`.
        begin_of_sequence:
            Beginning of sequence which the model will auto-complete.
        device:
            Model running device.
        max_seq_len:
            Maximum length of output sequences. Must be bigger than or equal
            to `2`.
        model:
            Language model.
        tokenizer:
            Tokenizer for encoding and decoding sequences.

    Raises:
        TypeError:
            When one of the arguments is not an instance of its type
            annotation.
        ValueError:
            When one of the arguments does not follow its constraints. See
            docstring for argument constraints.

    Returns:
        Generated sequences.
    """
    # Type check.
    if not isinstance(beam_width, int):
        raise TypeError('`beam_width` must be an instance of `int`.')

    if not isinstance(begin_of_sequence, str):
        raise TypeError('`begin_of_sequence` must be an instance of `str`.')

    if not isinstance(device, torch.device):
        raise TypeError('`device` must be an instance of `torch.device`.')

    if not isinstance(max_seq_len, int):
        raise TypeError('`max_seq_len` must be an instance of `int`.')

    if not isinstance(model, (
            lmp.model.BaseRNNModel,
            lmp.model.BaseResRNNModel
    )):
        raise TypeError(
            '`model` must be an instance of '
            '`Union[lmp.model.BaseRNNModel, lmp.model.BaseResRNNModel]`.'
        )

    if not isinstance(tokenizer, lmp.tokenizer.BaseTokenizer):
        raise TypeError(
            '`tokenizer` must be an instance of '
            '`lmp.tokenizer.BaseTokenizer`.'
        )

    # Value check.
    if beam_width < 1:
        raise ValueError('`beam_width` must be bigger than or equal to `1`.')

    if max_seq_len < 2:
        raise ValueError('`max_seq_len` must be bigger than or equal to `2`.')

    # Evaluation mode.
    model.eval()

    # Encode sequence and convert into tensor. Remove `[eos]` since we are
    # using a beginning of sentence.
    cur_seq = tokenizer.encode(begin_of_sequence, max_seq_len=-1)
    cur_seq = torch.LongTensor(cur_seq)[:-1].to(device)

    # Get beginning sequence length.
    seq_len = cur_seq.size(-1)

    # Generated sequence.
    # Start shape (1, S).
    # Final shape (B, S).
    cur_seq = cur_seq.reshape(1, seq_len)

    # Accumulated negative log-likelihood. Using log changes consecutive
    # probability multiplication into a sum of log probabilities, which
    # avoids computational underflow. Initialized to zero with shape (B).
    accum_prob = torch.zeros(beam_width).to(device)

    for _ in range(max_seq_len - seq_len):
        # Model prediction has shape (B, S, V).
        pred_y = model.predict(cur_seq)

        # Record all beams' predictions.
        # Each beam will predict `beam_width` different results.
        # So we have `beam_width * beam_width` different results in total.
        top_k_in_all_beams = []
        for out_beam in range(cur_seq.size(0)):
            # Get `beam_width` different predictions from beam `out_beam`.
            # `top_k_prob_in_beam` has shape (B) and `top_k_index_in_beam`
            # has shape (B).
            top_k_prob_in_beam, top_k_index_in_beam = \
                pred_y[out_beam, -1].topk(k=beam_width, dim=-1)

            # Record each beam's negative log-likelihood and concatenate the
            # next token id based on the prediction.
            for in_beam in range(beam_width):
                # Accumulate negative log-likelihood. Since log outputs a
                # negative value when its input is in range (0, 1), we negate
                # it to be positive.
                prob = accum_prob[out_beam] - top_k_prob_in_beam[in_beam].log()
                prob = prob.unsqueeze(0)

                # Concatenate the next predicted token id.
                seq = torch.cat(
                    [
                        cur_seq[out_beam],
                        top_k_index_in_beam[in_beam].unsqueeze(0)
                    ],
                    dim=-1
                ).unsqueeze(0)

                # Record result.
                top_k_in_all_beams.append({'prob': prob, 'seq': seq})

        # Compare each recorded result in all beams. First concatenate
        # tensors, then use `topk` with `largest=False` to get the
        # `beam_width` most probable predictions in all beams (lower
        # accumulated negative log-likelihood means a more probable
        # sequence).
        _, top_k_index_in_all_beams = torch.cat(
            [beam['prob'] for beam in top_k_in_all_beams]
        ).topk(k=beam_width, dim=0, largest=False)

        # Update `cur_seq` with the `beam_width` most probable results.
        cur_seq = torch.cat(
            [
                top_k_in_all_beams[index]['seq']
                for index in top_k_index_in_all_beams
            ],
            dim=0
        )

        # Update accumulated negative log-likelihood.
        accum_prob = torch.cat(
            [
                top_k_in_all_beams[index]['prob']
                for index in top_k_index_in_all_beams
            ],
            dim=0
        )

    return tokenizer.batch_decode(cur_seq.tolist())
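A small sketch of the beam scoring used above: candidate sequences are ranked by accumulated negative log-likelihood, so the most probable extension is the one with the smallest score. The probabilities below are illustrative only:

import torch

# Accumulated NLL of two hypothetical beams and the probabilities of their
# best next tokens.
accum_nll = torch.tensor([1.2, 0.9])
next_token_prob = torch.tensor([0.5, 0.1])

# New scores: previous NLL minus log-probability (i.e. plus the NLL of the
# current step).
scores = accum_nll - next_token_prob.log()

# Lower accumulated NLL means a more probable sequence, hence `largest=False`.
best_scores, best_beams = scores.topk(k=1, largest=False)
print(best_scores.tolist(), best_beams.tolist())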