def eval_samples(raw_samples, tokenizer):
    """Evaluates generated samples."""
    gt_refs = []
    samples = []

    groups = group_samples(raw_samples, tokenizer)
    groups = list(groups.values())
    avg_group_size = np.mean([len(g[-1]) for g in groups])
    logging.info('Average samples per example: %.2f', avg_group_size)
    avg_group_size = int(math.ceil(avg_group_size))
    for i, (gt, s) in enumerate(groups):
        gt_refs.append(gt)
        idx = i % len(groups)
        samples.append(groups[idx][-1])

    gt_bleu, gt_n_grams = utils.compute_bleu(samples, gt_refs)

    logging.info('Processed %d samples in total.',
                 sum([len(s) for s in samples]))
    flat_samples = []
    for s in samples:
        flat_samples.extend(s)
    logging.info('Average sample len: %.2f',
                 np.mean([len(s) for s in flat_samples]))
    logging.info('Average ground-truth len: %.2f',
                 np.mean([len(gt) for gt in gt_refs]))

    logging.info('Ground-truth BLEU: %6.2f, n-gram precision: (%s)',
                 gt_bleu * 100,
                 ', '.join(['%6.2f%%' % (s * 100) for s in gt_n_grams]))
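
# eval_samples above treats each group as a (ground_truth, samples) pair and
# reads the sample list via its last element. A minimal sketch of what
# group_samples is assumed to return, purely for illustration; the record
# layout of raw_samples and the name group_samples_sketch are assumptions,
# not the original helper.
def group_samples_sketch(raw_samples, tokenizer):
    groups = {}
    for example_id, gt_tokens, sample_tokens in raw_samples:  # hypothetical record layout
        _, samples = groups.setdefault(example_id, (gt_tokens, []))
        samples.append(sample_tokens)
    return groups
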
def validate(model,
             dev_data,
             vocab_src,
             vocab_tgt,
             epoch,
             config,
             direction=None):
    model.eval()
    device = torch.device(
        "cpu") if config["device"] == "cpu" else torch.device("cuda:0")
    with torch.no_grad():
        model_hypotheses = []
        references = []

        val_dl = DataLoader(dev_data,
                            batch_size=config["batch_size_eval"],
                            shuffle=False,
                            num_workers=4)
        val_dl = BucketingParallelDataLoader(val_dl)
        for sentences_x, sentences_y in val_dl:
            if direction is None or direction == "xy":
                x_in, _, x_mask, x_len = create_batch(sentences_x, vocab_src,
                                                      device)
                x_mask = x_mask.unsqueeze(1)
            else:
                x_in, _, x_mask, x_len = create_batch(sentences_y, vocab_src,
                                                      device)
                x_mask = x_mask.unsqueeze(1)

            enc_output, enc_hidden = model.encode(x_in, x_len)
            dec_hidden = model.init_decoder(enc_output, enc_hidden)

            raw_hypothesis = beam_search(model.decoder, model.emb_tgt,
                                         model.generate_tm, enc_output,
                                         dec_hidden, x_mask, vocab_tgt.size(),
                                         vocab_tgt[SOS_TOKEN],
                                         vocab_tgt[EOS_TOKEN],
                                         vocab_tgt[PAD_TOKEN], config)

            hypothesis = batch_to_sentences(raw_hypothesis, vocab_tgt)
            model_hypotheses += hypothesis.tolist()

            if direction is None or direction == "xy":
                references += sentences_y.tolist()
            else:
                references += sentences_x.tolist()

        save_hypotheses(model_hypotheses, epoch, config)
        model_hypotheses, references = clean_sentences(model_hypotheses,
                                                       references, config)
        bleu = compute_bleu(model_hypotheses, references, epoch, config,
                            direction)
        return bleu
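
# Usage note (read off validate() above, not from external documentation):
# direction=None or "xy" decodes x -> y and scores hypotheses against
# sentences_y; any other value decodes y -> x and scores against sentences_x.
# config is expected to provide at least "device" and "batch_size_eval", e.g.
#   bleu = validate(model, dev_data, vocab_src, vocab_tgt, epoch, config, direction="xy")
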
Example #3
    return parser


if __name__ == '__main__':
    # generate parser / parse parameters
    parser = get_parser()
    params = parser.parse_args()

    # check parameters
    assert os.path.isfile(params.ref)
    assert os.path.isfile(params.hyp)

    refs = []
    with open(params.ref) as ref:
        tmp = []
        for line in ref:
            if line != "\n":
                tmp.append(line.strip().split())
            else:
                refs.append(tmp)
                tmp = []
        if tmp:  # keep the last group even if the file lacks a trailing blank line
            refs.append(tmp)

    with open(params.hyp) as hyp:
        hyps = [line.strip().split() for line in hyp]

    r = compute_bleu(reference_corpus=refs,
                     translation_corpus=hyps,
                     max_order=params.max_order,
                     smooth=params.smooth)
    print(r)
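
# Input layout inferred from the reading loop above (an assumption, since
# get_parser() is shown only in truncated form): the reference file groups one
# or more tokenized references per segment, with groups separated by blank
# lines, while the hypothesis file holds exactly one tokenized hypothesis per
# line, aligned with those groups in order. A hypothetical invocation, with
# the flag names guessed from the params.* attributes:
#
#   python eval_bleu.py --ref refs.txt --hyp hyps.txt --max_order 4 --smooth
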
Example #4
    def validate(self, val_loader, epoch=0):
        self.model.eval()
        val_loss = 0.0
        total_acc = 0.0
        total_recall = 0.0
        total_precision = 0.0
        total_f1 = 0.0
        total_cm = 0
        total_d_acc = 0.0
        bleu = 0.0
        total_l1 = 0
        total_l2 = 0
        total_l3 = 0

        k_vals = [1, 2, 3, 4, 5]
        total_topk = {k: 0.0 for k in k_vals}
        per_disease_topk = defaultdict(lambda: {str(k): 0.0 for k in k_vals})
        per_disease_bleu = defaultdict(list)
        with torch.no_grad():
            for i, (_, images, labels, f_labels,
                    text) in enumerate(val_loader):
                batch_size = images.size(0)
                images = images.to(self.device)
                labels = labels.to(self.device)
                f_labels = f_labels.to(self.device)
                text = text.to(self.device)
                diseases, fine_diseases, text_pred = self.model(images, text)
                loss1 = self.criterion(diseases, labels)
                loss2 = self.criterion(fine_diseases, f_labels)
                text_loss = 0.0
                for k in range(text_pred.size(1)):
                    text_loss += self.criterion(text_pred[:, k].squeeze(),
                                                text[:, k + 1].squeeze())

                val_loss += torch.stack(
                    (loss1, loss2, text_loss))[self.tasks].sum()

                preds = F.log_softmax(fine_diseases, dim=-1)
                pred = preds.argmax(dim=-1)
                d_pred = F.log_softmax(diseases, dim=-1).argmax(dim=-1)

                # Evaluation of P, R, F1, CM, BLEU
                total_acc += (pred.eq(f_labels).sum().item() / batch_size)
                total_d_acc += (d_pred.eq(labels).sum().item() / batch_size)
                acc, recall, precision, f1 = accuracy_recall_precision_f1(
                    d_pred, labels)
                cm = calculate_confusion_matrix(d_pred, labels)
                try:
                    total_cm += (cm / batch_size)
                except Exception:
                    print("Error occurred for this CM")
                    print(cm / batch_size)

                # Top-k evaluation
                for k in k_vals:
                    total_topk[k] += compute_topk(preds, f_labels, k)
                    for d in [0, 1, 2, 3]:
                        mask = labels.eq(d)
                        if mask.sum() > 0:
                            per_disease_topk[d][str(k)] += compute_topk(
                                preds[mask], f_labels[mask], k)

                total_recall += np.mean(recall)
                total_precision += np.mean(precision)
                total_f1 += np.mean(f1)
                preds = torch.argmax(F.log_softmax(text_pred, dim=-1), dim=-1)
                text1 = text[:, 1:].squeeze().tolist()
                preds1 = preds.tolist()
                t_bleu, sent_gt, sent_pred = compute_bleu(
                    self.lang, text1, preds1, labels, per_disease_bleu)

                # Book-keeping
                bleu += t_bleu
                total_l1 += loss1.item()
                total_l2 += loss2.item()
                total_l3 += text_loss.item()
        bleu = bleu / (len(val_loader))
        val_loss = val_loss / len(val_loader)
        total_l1 /= len(val_loader)
        total_l2 /= len(val_loader)
        total_l3 /= len(val_loader)
        total_acc = total_acc / len(val_loader)
        total_d_acc = total_d_acc / len(val_loader)
        total_f1 = total_f1 / len(val_loader)
        total_precision = total_precision / len(val_loader)
        total_recall = total_recall / len(val_loader)
        total_cm = total_cm / len(val_loader)

        self.scheduler.step(val_loss)
        if val_loss <= self.min_val_loss:
            torch.save(self.model.state_dict(), self.save_path)
            self.min_val_loss = val_loss

        disease_f1 = {}
        disease_precision = {}
        disease_recall = {}

        #for i in range(len(total_f1)):
        #   disease_f1[i] = total_f1[i]
        #   disease_precision[i] = total_precision[i]
        #   disease_recall[i] = total_recall[i]
        for d in per_disease_bleu:
            per_disease_bleu[d] = np.mean(per_disease_bleu[d])

        total_topk = {str(k): total_topk[k] / len(val_loader) for k in k_vals}
        for d in [0, 1, 2, 3]:
            for k in k_vals:
                per_disease_topk[d][str(k)] /= len(val_loader)

        return (val_loss, total_d_acc, total_acc, bleu, total_f1, total_recall,
                total_precision, sent_gt, sent_pred, total_topk,
                per_disease_topk, per_disease_bleu, total_cm)
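
# A minimal sketch of the compute_topk helper used in the validation loop
# above: the fraction of examples whose true label appears among the k
# highest-scoring classes. The real helper is not shown, so this sketch and
# its name are assumptions kept only for illustration.
def compute_topk_sketch(log_probs, labels, k):
    topk_classes = log_probs.topk(k, dim=-1).indices           # (batch, k)
    hits = topk_classes.eq(labels.unsqueeze(-1)).any(dim=-1)   # (batch,)
    return hits.float().mean().item()
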
Example #5
def main():

    global args, max_length
    args = parser.parse_args()

    if args.eval:

        if not os.path.exists(args.output_dir):
            print("Output directory does not exist")
            exit(1)
        try:
            model = EncoderDecoder().load(args.output_dir)
            print("Model loaded successfully")
        except Exception:
            print("The trained model could not be loaded...")
            exit()

        test_pairs = readFile(args.test_file)

        outputs = model.evaluatePairs(test_pairs, rand=False, char=args.char)
        writeToFile(outputs, os.path.join(args.output_dir, "output.pkl"))
        reference = []
        hypothesis = []

        for (hyp, ref) in outputs:
            if args.char or args.char_bleu:
                reference.append([list(ref)])
                hypothesis.append(list(hyp))
            else:
                reference.append([ref.split(" ")])
                hypothesis.append(hyp.split(" "))

        bleu_score = compute_bleu(reference, hypothesis)
        print("Bleu Score: " + str(bleu_score))

        print(
            model.evaluateAndShowAttention(
                "L'anglais n'est pas facile pour nous.", char=args.char))
        print(
            model.evaluateAndShowAttention(
                "J'ai dit que l'anglais est facile.", char=args.char))
        print(
            model.evaluateAndShowAttention(
                "Je n'ai pas dit que l'anglais est une langue facile.",
                char=args.char))
        print(
            model.evaluateAndShowAttention("Je fais un blocage sur l'anglais.",
                                           char=args.char))

    else:
        input_lang, output_lang, pairs = prepareData(args.train_file)

        print(random.choice(pairs))

        if args.char:
            model = EncoderDecoder(args.hidden_size, input_lang.n_chars,
                                   output_lang.n_chars, args.drop, args.tfr,
                                   args.max_length, args.lr, args.simple,
                                   args.bidirectional, args.dot, False, 1)
        else:
            model = EncoderDecoder(args.hidden_size, input_lang.n_words,
                                   output_lang.n_words, args.drop, args.tfr,
                                   args.max_length, args.lr, args.simple,
                                   args.bidirectional, args.dot, args.multi,
                                   args.num_layers)

        model.trainIters(pairs,
                         input_lang,
                         output_lang,
                         args.n_iters,
                         print_every=args.print_every,
                         plot_every=args.plot_every,
                         char=args.char)
        model.save(args.output_dir)
        model.evaluatePairs(pairs, char=args.char)
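
# Hedged usage sketch for the script above. The flag names are guessed from
# the args.* attributes (the argparse setup is not shown) and the script name
# seq2seq.py is hypothetical:
#
#   train:    python seq2seq.py --train_file data/eng-fra.txt --output_dir runs/fra-eng --n_iters 75000
#   evaluate: python seq2seq.py --eval --test_file data/test.txt --output_dir runs/fra-eng
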
def validate(model,
             dev_data,
             vocab_src,
             vocab_tgt,
             epoch,
             config,
             direction=None):
    model.eval()
    device = torch.device(
        "cpu") if config["device"] == "cpu" else torch.device("cuda:0")
    with torch.no_grad():
        model_hypotheses = []
        references = []

        val_dl = DataLoader(dev_data,
                            batch_size=config["batch_size_eval"],
                            shuffle=False,
                            num_workers=2)
        val_dl = BucketingParallelDataLoader(val_dl)
        val_kl = 0
        for sentences_x, sentences_y in val_dl:
            if direction is None or direction == "xy":
                x_in, _, x_mask, x_len = create_batch(sentences_x, vocab_src,
                                                      device)
                x_mask = x_mask.unsqueeze(1)
            else:
                x_in, _, x_mask, x_len = create_batch(sentences_y, vocab_src,
                                                      device)
                x_mask = x_mask.unsqueeze(1)

            qz = model.inference(x_in, x_mask, x_len)
            z = qz.mean

            pz = torch.distributions.Normal(loc=model.prior_loc,
                                            scale=model.prior_scale).expand(
                                                qz.mean.size())
            kl_loss = torch.distributions.kl.kl_divergence(qz, pz)
            kl_loss = kl_loss.sum(dim=1)
            val_kl += kl_loss.sum(dim=0)

            enc_output, enc_hidden = model.encode(x_in, x_len, z)
            dec_hidden = model.init_decoder(enc_output, enc_hidden, z)

            raw_hypothesis = beam_search(model.decoder, model.emb_tgt,
                                         model.generate_tm, enc_output,
                                         dec_hidden, x_mask, vocab_tgt.size(),
                                         vocab_tgt[SOS_TOKEN],
                                         vocab_tgt[EOS_TOKEN],
                                         vocab_tgt[PAD_TOKEN], config, z)

            hypothesis = batch_to_sentences(raw_hypothesis, vocab_tgt)
            model_hypotheses += hypothesis.tolist()

            if direction is None or direction == "xy":
                references += sentences_y.tolist()
            else:
                references += sentences_x.tolist()

        val_kl /= len(dev_data)
        save_hypotheses(model_hypotheses, epoch, config, direction)
        model_hypotheses, references = clean_sentences(model_hypotheses,
                                                       references, config)
        bleu = compute_bleu(model_hypotheses,
                            references,
                            epoch,
                            config,
                            direction,
                            kl=val_kl)
        return bleu
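
# This variant of validate() conditions encoding, decoder initialisation and
# beam search on a latent variable z and also reports the dev-set KL term. A
# standalone sketch of that KL computation with illustrative names (the
# argument names and the helper itself are assumptions, not project code):
import torch  # already imported by the surrounding examples; repeated so the sketch is self-contained


def kl_per_sentence_sketch(post_loc, post_scale, prior_loc, prior_scale):
    qz = torch.distributions.Normal(loc=post_loc, scale=post_scale)    # q(z|x)
    pz = torch.distributions.Normal(loc=prior_loc, scale=prior_scale)  # p(z)
    # analytic KL per latent dimension, summed over dimensions -> one value per sentence
    return torch.distributions.kl.kl_divergence(qz, pz).sum(dim=1)
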
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        '--data_dir',
        default=None,
        type=str,
        required=True,
        help=
        'The input data dir. Should contain the .tsv files (or other data files) for the task.'
    )
    parser.add_argument(
        '--action_generator_model_type',
        default=None,
        type=str,
        required=True,
        choices=list(MODEL_CLASSES.keys()),
        help=
        'Model type to use for initial action prediction, selected in the list: '
        + ', '.join(MODEL_CLASSES.keys()))
    parser.add_argument(
        '--consequence_generator_model_type',
        default=None,
        type=str,
        required=True,
        choices=list(MODEL_CLASSES.keys()),
        help=
        'Model type to use for consequence prediction, selected in the list: '
        + ', '.join(MODEL_CLASSES.keys()))
    parser.add_argument(
        '--action_refiner_model_type',
        default=None,
        type=str,
        required=True,
        choices=list(MODEL_CLASSES.keys()),
        help=
        'Model type to use for refined action prediction, selected in the list: '
        + ', '.join(MODEL_CLASSES.keys()))
    parser.add_argument(
        '--action_classifier_model_type',
        default=None,
        type=str,
        required=True,
        choices=list(MODEL_CLASSES.keys()),
        help=
        'Model type to use for action classification, selected in the list: ' +
        ', '.join(MODEL_CLASSES.keys()))
    parser.add_argument(
        '--consequence_classifier_model_type',
        default=None,
        type=str,
        required=True,
        choices=list(MODEL_CLASSES.keys()),
        help=
        'Model type to use for consequence classification, selected in the list: '
        + ', '.join(MODEL_CLASSES.keys()))
    parser.add_argument(
        '--action_generator_checkpoint',
        default=None,
        type=str,
        required=True,
        help='Path to pre-trained model used for initial action generation')
    parser.add_argument(
        '--consequence_generator_checkpoint',
        default=None,
        type=str,
        required=True,
        help='Path to pre-trained model used for consequence generation')
    parser.add_argument(
        '--action_refiner_checkpoint',
        default=None,
        type=str,
        required=True,
        help='Path to pre-trained model used for action refinement')
    parser.add_argument(
        '--action_classifier_checkpoint',
        default=None,
        type=str,
        required=True,
        help='Path to pre-trained model used for action classification')
    parser.add_argument(
        '--consequence_classifier_checkpoint',
        default=None,
        type=str,
        required=True,
        help='Path to pre-trained model used for consequence classification')
    parser.add_argument(
        '--split_name',
        default=None,
        type=str,
        required=True,
        choices=SPLITS,
        help='The name of the data split used to train / evaluate the model: '
        + ', '.join(SPLITS))
    parser.add_argument(
        '--output_dir',
        default=None,
        type=str,
        required=True,
        help=
        'The root output directory where the model predictions and checkpoints will be written.'
    )

    ## Generation parameters
    parser.add_argument(
        '--max_gen_length',
        default=60,
        type=int,
        help='The maximum length of the sequence to be generated.')
    parser.add_argument(
        '--temperature',
        default=1.0,
        type=float,
        help='The value used to module the next token probabilities.')
    parser.add_argument(
        '--k',
        default=0,
        type=int,
        help=
        'The number of highest probability vocabulary tokens to keep for top-k-filtering.'
    )
    parser.add_argument(
        '--p',
        default=0,
        type=float,
        help=
        'If set to float < 1, only the most probable tokens with probabilities that add up to '
        'top_p or higher are kept for generation.')
    parser.add_argument('--num_beams',
                        default=0,
                        type=int,
                        required=False,
                        help='beams for beam search')
    parser.add_argument(
        '--do_sample',
        action='store_true',
        help=
        'Whether to generate predictions via sampling; if off, decoding is done greedily.'
    )
    parser.add_argument(
        '--sc101_action_embeddings_path',
        default=None,
        type=str,
        help=
        'Path to the file containing the Social-Chemistry-101 action embeddings.'
    )
    parser.add_argument(
        '--num_actions',
        default=0,
        type=int,
        required=False,
        help=
        'Number of actions to be generated for a single story prefix prior to ranking'
    )
    parser.add_argument(
        '--predict_consequences',
        action='store_true',
        help=
        'Whether to use consequences when ranking predicted action alternatives.'
    )

    ## Other parameters
    parser.add_argument(
        '--config_name',
        default='',
        type=str,
        help='Pretrained config name or path if not the same as model_name')
    parser.add_argument(
        '--tokenizer_name',
        default='',
        type=str,
        help='Pretrained tokenizer name or path if not the same as model_name')
    parser.add_argument(
        '--cache_dir',
        default='',
        type=str,
        help=
        'The cache directory in which to store the pre-trained models downloaded from s3'
    )
    parser.add_argument(
        '--max_seq_length',
        default=128,
        type=int,
        help=
        'The maximum total input sequence length after tokenization. Sequences longer '
        'than this will be truncated, sequences shorter will be padded.')
    parser.add_argument(
        '--do_lower_case',
        action='store_true',
        help='Set this flag if you are using an uncased model.')
    parser.add_argument('--data_cache_dir',
                        default=None,
                        type=str,
                        help='The root directory for caching features.')

    parser.add_argument('--per_gpu_eval_batch_size',
                        default=8,
                        type=int,
                        help='Batch size per GPU/CPU for evaluation.')

    parser.add_argument(
        '--eval_all_checkpoints',
        action='store_true',
        help=
        'Evaluate all checkpoints starting with the same prefix as model_name and ending '
        'with the step number')
    parser.add_argument('--no_cuda',
                        action='store_true',
                        help='Avoid using CUDA when available')
    parser.add_argument('--overwrite_output_dir',
                        action='store_true',
                        help='Overwrite the content of the output directory')
    parser.add_argument(
        '--overwrite_cache',
        action='store_true',
        help='Overwrite the cached training and evaluation sets')
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help='random seed for initialization')

    parser.add_argument(
        '--fp16',
        action='store_true',
        help=
        'Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit'
    )
    parser.add_argument(
        '--fp16_opt_level',
        type=str,
        default='O1',
        help=
        'For fp16: Apex AMP optimization level selected in [\'O0\', \'O1\', \'O2\', and \'O3\']. '
        'See details at https://nvidia.github.io/apex/amp.html')
    parser.add_argument('--local_rank',
                        type=int,
                        default=-1,
                        help='For distributed training: local_rank')
    parser.add_argument('--server_ip',
                        type=str,
                        default='',
                        help='For distant debugging.')
    parser.add_argument('--server_port',
                        type=str,
                        default='',
                        help='For distant debugging.')
    args = parser.parse_args()

    # Check if directories need to be created
    args.original_data_dir = args.data_dir
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # Setup distant debugging, if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        logging.info('Waiting for debugger attach ...')
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup CUDA, GPU & distributed training
    device = torch.device(
        'cuda' if torch.cuda.is_available() and not args.no_cuda else 'cpu')
    args.n_gpu = torch.cuda.device_count()
    args.device = device

    # Setup logging
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning(
        'Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s',
        args.local_rank, device, args.n_gpu, bool(args.local_rank != -1),
        args.fp16)

    # Set seed
    set_seed(args)

    # Generate with refinement
    logger.info('Generating actions through iterative refinement:')
    initial_actions, refined_actions = action_refinement_with_ranking(
        args, 'test')
    logger.info('Self-BLEU between initial and refined action predictions:')
    logging.info(compute_bleu(initial_actions, refined_actions))
    logger.info('***** Experiment finished *****')
def eval_bleu(
    train_loader: d.BatchedIterator,
    valid_loader: d.BatchedIterator,
    model: nn.Module,
    en_vocab: Vocabulary,
    fr_vocab: Vocabulary,
    device: str,
    multi_gpu: bool,
    eval_fast: bool,
    output_file: str,
) -> None:
    model = model.to(device)

    if output_file is not None:
        output_file = open(output_file, 'w')

    if multi_gpu and device == 'cuda':
        print('Using multi-GPU training')
        model = torch.nn.DataParallel(model, device_ids=[0, 1]).cuda()

    bleus = []
    count = 0
    with tqdm(train_loader, total=len(train_loader)) as pbar:
        for i, data in enumerate(pbar):
            if i == 0:
                continue
            src, src_lengths = data.src
            trg, trg_lengths = data.trg

            if eval_fast:
                predicted = model.generate_max(src, src_lengths, 100, device)
            else:
                predicted = model.slow_generate(src, src_lengths, 100, device)
            # predicted = (torch.Tensor(src.size(0), 100).uniform_() * (len(fr_vocab) - 1)).long()
            # predicted = predicted * 
            # predicted = model.generate_beam(src, src_lengths, 100, 5, device)
            pred_arr = utils.torchtext_convert_to_str(predicted.cpu().numpy(), fr_vocab)[0]
            out_arr = utils.torchtext_convert_to_str(trg.cpu().numpy(), fr_vocab)[0]
            pred_slim_arr = utils.get_raw_sentence(pred_arr)
            out_slim_arr = utils.get_raw_sentence(out_arr)
            curr_bleu = utils.compute_bleu(pred_slim_arr, out_slim_arr)
            bleus.append(curr_bleu)

            if output_file is not None:
                src_arr = utils.torchtext_convert_to_str(src.cpu().numpy(), en_vocab)[0]
                src_slim_arr = utils.get_raw_sentence(src_arr)
                output = ' '.join(pred_slim_arr)
                actual_out = ' '.join(out_slim_arr)
                src = ' '.join(src_slim_arr)
                
                entry_str = '''
{DELIM}
    BLEU = {BLEU}
    src = {src}
    target = {target}
    predicted = {pred} 
'''
                entry_str = entry_str.format(
                    DELIM=utils.create_entry_delim(),
                    BLEU=curr_bleu * 100,
                    src=src,
                    target=actual_out,
                    pred=output,
                )

                output_file.write(entry_str)
            count += 1
            pbar.set_postfix(
                curr_bleu=curr_bleu * 100,
                avg_bleu=(sum(bleus) / len(bleus) * 100)
            )
            pbar.refresh()

    if output_file is not None:
        output_file.write(
            utils.create_entry_delim() + "\n"
        )

        output_file.write(
            'Average BLEU: {}\n'.format((sum(bleus) / len(bleus) * 100))
        )
        output_file.close()
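
# A minimal sketch of the utils.get_raw_sentence helper assumed above:
# truncate a decoded token sequence at the end-of-sentence marker and drop
# padding / start-of-sentence tokens. The special-token strings and the name
# get_raw_sentence_sketch are assumptions; the real helper may differ.
def get_raw_sentence_sketch(tokens, eos='<eos>', drop=('<pad>', '<sos>')):
    out = []
    for tok in tokens:
        if tok == eos:
            break
        if tok not in drop:
            out.append(tok)
    return out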