def score_trads(preds, trg_loader, eval_kwargs):
    """Score a full list of already-decoded predictions against the target loader's
    gold sentences and return corpus BLEU.

    Args:
        preds: list of decoded prediction strings, in loader order for `split`.
        trg_loader: target-side loader providing batches, vocab and special tokens.
        eval_kwargs: dict with optional 'split' (default 'val'),
            'batch_size' (default 80) and 'verbose' (default 0).

    Returns:
        dict: {'Bleu': corpus BLEU (Moses) of preds vs. gold}.
    """
    split = eval_kwargs.get('split', 'val')
    batch_size = eval_kwargs.get('batch_size', 80)
    verbose = eval_kwargs.get('verbose', 0)
    ground_truths = []
    trg_loader.reset_iterator(split)
    n = 0
    while True:
        # get batch of gold target sentences
        data_trg = trg_loader.get_trg_batch(split, range(batch_size), batch_size)
        output_lines_trg_gold = data_trg['out_labels']
        n += batch_size
        # Decode the gold sentences back to text
        sent_gold = decode_sequence(trg_loader.get_vocab(),
                                    output_lines_trg_gold,
                                    eos=trg_loader.eos,
                                    bos=trg_loader.bos)
        if not verbose:
            verb = not (n % 1000)
        else:
            verb = verbose
        # FIX: pair the gold sentences with the predictions of the CURRENT batch.
        # The original zipped the whole `preds` list, which always paired the
        # first `batch_size` predictions with every batch's gold sentences.
        for (l, gl) in zip(preds[n - batch_size:n], sent_gold):
            ground_truths.append(gl)
            if verb:
                lg.print_sampled("", gl, l)
        ix1 = data_trg['bounds']['it_max']
        if data_trg['bounds']['wrapped']:
            break
        if n >= ix1:
            print('Evaluated the required samples (%s)' % n)
            break
    bleu_moses, _ = corpus_bleu(preds, ground_truths)
    scores = {'Bleu': bleu_moses}
    return scores
def track_model(job_name, model, src_loader, trg_loader, eval_kwargs):
    """Run the model sentence-by-sentence (batch_size must be 1) and collect
    per-sample tracking tensors (attention alphas, alignments, activations).

    Args:
        job_name: logger name.
        model: model exposing .version, .encoder/.map/.decoder.sample (seq2seq)
            or .track (other versions).
        src_loader / trg_loader: source / target data loaders.
        eval_kwargs: dict with 'batch_size' (must be 1), 'split', 'verbose',
            'max_samples', 'offset'; BOS/EOS/PAD/UNK are injected here.

    Returns:
        dict with decoded sources/preds, tracking tensors, channel constants
        and corpus BLEU.
    """
    source = []
    preds = []
    ground_truths = []
    batched_alphas = []
    batched_aligns = []
    batched_activ_aligns = []
    batched_activs = []
    batched_embed_activs = []
    # Only defined by model.track(); keep None for the seq2seq path so the
    # return dict is always well-formed.
    C = None
    batch_size = eval_kwargs.get('batch_size', 1)
    assert batch_size == 1, "Batch size must be 1"
    split = eval_kwargs.get('split', 'val')
    verbose = eval_kwargs.get('verbose', 0)
    max_samples = eval_kwargs.get('max_samples', -1)
    eval_kwargs['BOS'] = trg_loader.bos
    eval_kwargs['EOS'] = trg_loader.eos
    eval_kwargs['PAD'] = trg_loader.pad
    eval_kwargs['UNK'] = trg_loader.unk
    print('src_loader ref:', src_loader.ref)
    remove_bpe = 'BPE' in src_loader.ref
    print('Removing bpe:', remove_bpe)
    logger = logging.getLogger(job_name)
    # Make sure to be in evaluation mode
    model.eval()
    offset = eval_kwargs.get('offset', 0)
    print('Starting from ', offset)
    src_loader.iterators[split] = offset
    trg_loader.iterators[split] = offset
    # src_loader.reset_iterator(split)
    # trg_loader.reset_iterator(split)
    n = 0
    while True:
        # get batch
        data_src, order = src_loader.get_src_batch(split, batch_size)
        data_trg = trg_loader.get_trg_batch(split, order, batch_size)
        n += batch_size
        if model.version == 'seq2seq':
            # FIX: do not clobber the `source` list (it accumulates decoded
            # source sentences below); use a separate name for encoder states.
            enc_states = model.encoder(data_src)
            enc_states = model.map(enc_states)
            batch_preds, _ = model.decoder.sample(enc_states, eval_kwargs)
        else:
            # track returns seq, alphas, aligns, activ_aligns, activs,
            # embed_activs, clean_cstr
            (batch_preds, alphas, aligns, activ_aligns,
             activs, embed_activs, C) = model.track(data_src, eval_kwargs)
            batched_alphas.append(alphas)
            batched_aligns.append(aligns)
            batched_activ_aligns.append(activ_aligns)
            batched_activs.append(activs)
            batched_embed_activs.append(embed_activs)
        if isinstance(batch_preds, list):
            # with beam size, preds are unpadded lists: decode one by one
            sent_preds = [decode_sequence(trg_loader.get_vocab(),
                                          np.array(pred).reshape(1, -1),
                                          eos=trg_loader.eos,
                                          bos=trg_loader.bos,
                                          remove_bpe=False)[0]
                          for pred in batch_preds]
        else:
            # decode
            sent_preds = decode_sequence(trg_loader.get_vocab(), batch_preds,
                                         eos=trg_loader.eos,
                                         bos=trg_loader.bos,
                                         remove_bpe=False)
        # Do the same for gold sentences
        sent_source = decode_sequence(src_loader.get_vocab(),
                                      data_src['labels'].data.cpu().numpy(),
                                      eos=src_loader.eos,
                                      bos=src_loader.bos,
                                      remove_bpe=False)
        source.append(sent_source)
        sent_gold = decode_sequence(trg_loader.get_vocab(),
                                    data_trg['out_labels'].data.cpu().numpy(),
                                    eos=trg_loader.eos,
                                    bos=trg_loader.bos,
                                    remove_bpe=False)
        if not verbose:
            verb = not (n % 300)
        else:
            verb = verbose
        for (sl, l, gl) in zip(sent_source, sent_preds, sent_gold):
            preds.append(l)
            ground_truths.append(gl)
            if verb:
                lg.print_sampled(sl, gl, l)
        if max_samples == -1:
            ix1 = data_src['bounds']['it_max']
        else:
            ix1 = max_samples
        if data_src['bounds']['wrapped']:
            break
        if n >= ix1:
            logger.warning('Evaluated the required samples (%s)' % n)
            break
    print('Sampled %d sentences' % len(preds))
    bleu_moses, _ = corpus_bleu(preds, ground_truths)
    return {'source': source,
            'preds': preds,
            'alpha': batched_alphas,
            'align': batched_aligns,
            'activ_align': batched_activ_aligns,
            'activ': batched_activs,
            'embed_activ': batched_embed_activs,
            'channels_cst': C,
            "bleu": bleu_moses,
            }
def sample_model(job_name, model, src_loader, trg_loader, eval_kwargs):
    """Sample translations for a whole split with beam scoring and return
    (decoded predictions, corpus BLEU).

    Args:
        job_name: logger name.
        model: model exposing .version and .sample / .encoder+.decoder.sample.
        src_loader / trg_loader: source / target data loaders.
        eval_kwargs: dict with 'batch_size', 'split', 'verbose', 'remove_bpe',
            'lenpen', 'lenpen_mode'; BOS/EOS/PAD/UNK are injected here.

    Returns:
        tuple: (list of decoded prediction strings, corpus BLEU).
    """
    preds = []
    ground_truths = []
    batch_size = eval_kwargs.get('batch_size', 1)
    split = eval_kwargs.get('split', 'val')
    verbose = eval_kwargs.get('verbose', 0)
    eval_kwargs['BOS'] = trg_loader.bos
    eval_kwargs['EOS'] = trg_loader.eos
    eval_kwargs['PAD'] = trg_loader.pad
    eval_kwargs['UNK'] = trg_loader.unk
    remove_bpe = eval_kwargs.get('remove_bpe', True)
    logger = logging.getLogger(job_name)
    model.eval()
    src_loader.reset_iterator(split)
    trg_loader.reset_iterator(split)
    n = 0
    start = time.time()
    lenpen_mode = eval_kwargs.get('lenpen_mode', 'wu')
    # Length-penalty scorer for beam search
    scorer = GNMTGlobalScorer(eval_kwargs['lenpen'], 0, 'none', lenpen_mode)
    while True:
        # get batch
        data_src, order = src_loader.get_src_batch(split, batch_size)
        data_trg = trg_loader.get_trg_batch(split, order, batch_size)
        n += batch_size
        if model.version == 'seq2seq':
            source = model.encoder(data_src)
            source = model.map(source)
            batch_preds, _ = model.decoder.sample(source, scorer, eval_kwargs)
        else:
            batch_preds, _ = model.sample(data_src, scorer, eval_kwargs)
        torch.cuda.empty_cache()  # FIXME choose an optimal freq
        if isinstance(batch_preds, list):
            # with beam size, preds are unpadded lists: decode one by one
            sent_preds = [decode_sequence(trg_loader.get_vocab(),
                                          np.array(pred).reshape(1, -1),
                                          eos=trg_loader.eos,
                                          bos=trg_loader.bos,
                                          remove_bpe=remove_bpe)[0]
                          for pred in batch_preds]
        else:
            # decode
            sent_preds = decode_sequence(trg_loader.get_vocab(), batch_preds,
                                         eos=trg_loader.eos,
                                         bos=trg_loader.bos,
                                         remove_bpe=remove_bpe)
        # Do the same for gold sentences
        sent_source = decode_sequence(src_loader.get_vocab(),
                                      data_src['labels'],
                                      eos=src_loader.eos,
                                      bos=src_loader.bos,
                                      remove_bpe=remove_bpe)
        sent_gold = decode_sequence(trg_loader.get_vocab(),
                                    data_trg['out_labels'],
                                    eos=trg_loader.eos,
                                    bos=trg_loader.bos,
                                    remove_bpe=remove_bpe)
        if not verbose:
            verb = not (n % 1000)
        else:
            verb = verbose
        for (sl, l, gl) in zip(sent_source, sent_preds, sent_gold):
            preds.append(l)
            ground_truths.append(gl)
            if verb:
                lg.print_sampled(sl, gl, l)
        ix1 = data_src['bounds']['it_max']
        # ix1 = 20
        if data_src['bounds']['wrapped']:
            break
        if n >= ix1:
            break
        # Free per-batch decodings before the next iteration
        del sent_source, sent_preds, sent_gold, batch_preds
    logger.warning('Sampled %d sentences in %.2f s',
                   len(preds), time.time() - start)
    bleu_moses, _ = corpus_bleu(preds, ground_truths)
    return preds, bleu_moses
def evaluate_model(job_name, trainer, src_loader, trg_loader, eval_kwargs):
    """Evaluate a trainer's model on a split: compute losses, sample
    translations and score corpus BLEU.

    Args:
        job_name: logger name.
        trainer: object exposing .model and .criterion.
        src_loader / trg_loader: source / target data loaders.
        eval_kwargs: dict with 'batch_size', 'max_samples', 'split', 'verbose';
            BOS/EOS/PAD/UNK are injected here.

    Returns:
        tuple: (preds, mean ML loss, mean final loss, corpus BLEU).
    """
    preds = []
    ground_truths = []
    batch_size = eval_kwargs.get('batch_size', 1)
    max_samples = eval_kwargs.get('max_samples', -1)
    split = eval_kwargs.get('split', 'val')
    verbose = eval_kwargs.get('verbose', 0)
    eval_kwargs['BOS'] = trg_loader.bos
    eval_kwargs['EOS'] = trg_loader.eos
    eval_kwargs['PAD'] = trg_loader.pad
    eval_kwargs['UNK'] = trg_loader.unk
    logger = logging.getLogger(job_name)
    # Make sure to be in evaluation mode
    model = trainer.model
    crit = trainer.criterion
    model.eval()
    src_loader.reset_iterator(split)
    trg_loader.reset_iterator(split)
    n = 0
    loss_sum = 0
    ml_loss_sum = 0
    loss_evals = 0
    start = time.time()
    while True:
        # get batch
        data_src, order = src_loader.get_src_batch(split, batch_size)
        data_trg = trg_loader.get_trg_batch(split, order, batch_size)
        n += batch_size
        if model.version == 'seq2seq':
            source = model.encoder(data_src)
            source = model.map(source)
            if trainer.criterion.version == "seq":
                losses, stats = crit(model, source, data_trg)
            else:  # ML & Token-level
                # init and forward decoder combined
                decoder_logit = model.decoder(source, data_trg)
                losses, stats = crit(decoder_logit, data_trg['out_labels'])
            batch_preds, _ = model.sample(source, eval_kwargs)
        else:
            losses, stats = crit(model(data_src, data_trg),
                                 data_trg['out_labels'])
            batch_preds, _ = model.sample(data_src, eval_kwargs)
        loss_sum += losses['final'].data.item()
        ml_loss_sum += losses['ml'].data.item()
        loss_evals = loss_evals + 1
        if isinstance(batch_preds, list):
            # with beam size, preds are unpadded lists: decode one by one
            sent_preds = [decode_sequence(trg_loader.get_vocab(),
                                          np.array(pred).reshape(1, -1),
                                          eos=trg_loader.eos,
                                          bos=trg_loader.bos)[0]
                          for pred in batch_preds]
        else:
            # decode
            sent_preds = decode_sequence(trg_loader.get_vocab(), batch_preds,
                                         eos=trg_loader.eos,
                                         bos=trg_loader.bos)
        # Do the same for gold sentences
        sent_source = decode_sequence(src_loader.get_vocab(),
                                      data_src['labels'],
                                      eos=src_loader.eos,
                                      bos=src_loader.bos)
        sent_gold = decode_sequence(trg_loader.get_vocab(),
                                    data_trg['out_labels'],
                                    eos=trg_loader.eos,
                                    bos=trg_loader.bos)
        if not verbose:
            verb = not (n % 1000)
        else:
            verb = verbose
        for (sl, l, gl) in zip(sent_source, sent_preds, sent_gold):
            preds.append(l)
            ground_truths.append(gl)
            if verb:
                lg.print_sampled(sl, gl, l)
        if max_samples == -1:
            ix1 = data_src['bounds']['it_max']
        else:
            ix1 = max_samples
        if data_src['bounds']['wrapped']:
            break
        if n >= ix1:
            break
    logger.warning('Evaluated %d samples in %.2f s',
                   len(preds), time.time() - start)
    bleu_moses, _ = corpus_bleu(preds, ground_truths)
    return preds, ml_loss_sum / loss_evals, loss_sum / loss_evals, bleu_moses
def evaluate_split(job_name, trainer, loader, eval_kwargs):
    """Evaluate a trainer's model on a paired loader (loader.src / loader.trg):
    compute losses, sample translations and score corpus BLEU.

    Args:
        job_name: logger name.
        trainer: object exposing .model and .criterion.
        loader: paired loader exposing .src, .trg and .get_batch().
        eval_kwargs: dict with optional 'max_samples' and 'verbose'.

    Returns:
        tuple: (preds, mean ML loss, mean final loss, corpus BLEU).
    """
    preds = []
    ground_truths = []
    max_samples = eval_kwargs.get('max_samples', -1)
    verbose = eval_kwargs.get('verbose', 0)
    logger = logging.getLogger(job_name)
    src_loader = loader.src
    trg_loader = loader.trg
    # Make sure to be in evaluation mode
    model = trainer.model
    crit = trainer.criterion
    model.eval()
    n = 0
    loss_sum = 0
    ml_loss_sum = 0
    loss_evals = 0
    start = time.time()
    while True:
        # get batch
        sample = loader.get_batch()
        data_src = sample["src"]
        data_trg = sample["trg"]
        ntokens = sample['ntokens']
        del sample
        print('Eval ntokens:', ntokens, "batch:", data_src['labels'].size(0))
        n += data_src['labels'].size(0)
        if model.version == 'seq2seq':
            source = model.encoder(data_src)
            source = model.map(source)
            if trainer.criterion.version == "seq":
                losses, stats = crit(model, source, data_trg)
            else:  # ML & Token-level
                # init and forward decoder combined
                decoder_logit = model.decoder(source, data_trg)
                losses, stats = crit(decoder_logit, data_trg['out_labels'])
            batch_preds, _ = model.sample(source, eval_kwargs)
        else:
            losses, stats = crit(model(data_src, data_trg),
                                 data_trg['out_labels'])
            batch_preds, _ = model.sample(data_src, eval_kwargs)
        loss_sum += losses['final'].data.item()
        ml_loss_sum += losses['ml'].data.item()
        loss_evals = loss_evals + 1
        torch.cuda.empty_cache()  # FIXME choose an optimal freq
        if isinstance(batch_preds, list):
            # with beam size, preds are unpadded lists: decode one by one
            sent_preds = [decode_sequence(trg_loader.get_vocab(),
                                          np.array(pred).reshape(1, -1),
                                          eos=trg_loader.eos,
                                          bos=trg_loader.bos)[0]
                          for pred in batch_preds]
        else:
            # decode
            sent_preds = decode_sequence(trg_loader.get_vocab(), batch_preds,
                                         eos=trg_loader.eos,
                                         bos=trg_loader.bos)
        # Do the same for gold sentences
        sent_source = decode_sequence(src_loader.get_vocab(),
                                      data_src['labels'],
                                      eos=src_loader.eos,
                                      bos=src_loader.bos)
        sent_gold = decode_sequence(trg_loader.get_vocab(),
                                    data_trg['out_labels'],
                                    eos=trg_loader.eos,
                                    bos=trg_loader.bos)
        if not verbose:
            verb = not (n % 1000)
        else:
            verb = verbose
        for (sl, l, gl) in zip(sent_source, sent_preds, sent_gold):
            preds.append(l)
            ground_truths.append(gl)
            if verb:
                lg.print_sampled(sl, gl, l)
        if max_samples == -1:
            ix1 = data_src['bounds']['it_max']
        else:
            ix1 = max_samples
        if data_src['bounds']['wrapped']:
            break
        if n >= ix1:
            break
    # print('Predictions lenght:', len(preds), len(ground_truths))
    # assert(len(preds) == trg_loader.h5_file['labels_val'].shape[0])
    logger.warning('Evaluated %d samples in %.2f s',
                   len(preds), time.time() - start)
    bleu_moses, _ = corpus_bleu(preds, ground_truths)
    return preds, ml_loss_sum / loss_evals, loss_sum / loss_evals, bleu_moses
def evaluate_loader(job_name, trainer, loader, src_dict, trg_dict, eval_kwargs):
    """Evaluate a trainer's model on a fairseq-style iterable loader whose
    samples carry 'net_input' / 'target' fields; compute losses, sample
    translations and score corpus BLEU.

    Args:
        job_name: logger name.
        trainer: object exposing .model and .criterion.
        loader: iterable of batches with 'net_input' (src_tokens, src_lengths,
            prev_output_tokens) and 'target'.
        src_dict / trg_dict: dictionaries exposing eos()/bos() ids.
        eval_kwargs: dict with optional 'max_samples' and 'verbose'.

    Returns:
        tuple: (preds, mean ML loss, mean final loss, corpus BLEU).
    """
    preds = []
    ground_truths = []
    max_samples = eval_kwargs.get('max_samples', math.inf)
    verbose = eval_kwargs.get('verbose', 0)
    logger = logging.getLogger(job_name)
    # Make sure to be in evaluation mode
    model = trainer.model
    crit = trainer.criterion
    model.eval()
    n = 0
    loss_sum = 0
    ml_loss_sum = 0
    loss_evals = 0
    start = time.time()
    for sample in loader:
        # get batch
        data_src = {
            "labels": sample['net_input']['src_tokens'].cuda(),
            "lengths": sample['net_input']['src_lengths'].cuda()
        }
        data_trg = {
            "labels": sample['net_input']['prev_output_tokens'].cuda(),
            "out_labels": sample['target'].cuda(),
            "lengths": sample['net_input']['src_lengths'].cuda()
            # modify loader to return trg lengths as well TODO
        }
        del sample
        # print("batch:", data_src['labels'].size(0))
        n += data_src['labels'].size(0)
        if model.version == 'seq2seq':
            source = model.encoder(data_src)
            source = model.map(source)
            if trainer.criterion.version == "seq":
                losses, stats = crit(model, source, data_trg)
            else:  # ML & Token-level
                # init and forward decoder combined
                decoder_logit = model.decoder(source, data_trg)
                losses, stats = crit(decoder_logit, data_trg['out_labels'])
            batch_preds, _ = model.sample(source, eval_kwargs)
        else:
            losses, stats = crit(model(data_src, data_trg),
                                 data_trg['out_labels'])
            batch_preds, _ = model.sample(data_src, eval_kwargs)
        loss_sum += losses['final'].data.item()
        ml_loss_sum += losses['ml'].data.item()
        loss_evals = loss_evals + 1
        torch.cuda.empty_cache()  # FIXME choose an optimal freq
        # FIX: the original passed bos=...eos() everywhere below; every sibling
        # function passes distinct BOS/EOS ids, so this was a copy-paste slip.
        if isinstance(batch_preds, list):
            # with beam size, preds are unpadded lists: decode one by one
            sent_preds = [decode_sequence(trg_dict,
                                          np.array(pred).reshape(1, -1),
                                          eos=trg_dict.eos(),
                                          bos=trg_dict.bos())[0]
                          for pred in batch_preds]
        else:
            # decode
            sent_preds = decode_sequence(trg_dict, batch_preds,
                                         eos=trg_dict.eos(),
                                         bos=trg_dict.bos())
        # Do the same for gold sentences
        sent_source = decode_sequence(src_dict, data_src['labels'],
                                      eos=src_dict.eos(),
                                      bos=src_dict.bos())
        sent_gold = decode_sequence(trg_dict, data_trg['out_labels'],
                                    eos=trg_dict.eos(),
                                    bos=trg_dict.bos())
        if not verbose:
            verb = not (n % 1000)
        else:
            verb = verbose
        for (sl, l, gl) in zip(sent_source, sent_preds, sent_gold):
            preds.append(l)
            ground_truths.append(gl)
            if verb:
                lg.print_sampled(sl, gl, l)
        if n > max_samples:
            break
    # print('Predictions lenght:', len(preds), len(ground_truths))
    # assert(len(preds) == trg_loader.h5_file['labels_val'].shape[0])
    logger.warning('Evaluated %d samples in %.2f s',
                   len(preds), time.time() - start)
    bleu_moses, _ = corpus_bleu(preds, ground_truths)
    return preds, ml_loss_sum / loss_evals, loss_sum / loss_evals, bleu_moses