Example #1
def evaluate(args, model, tokenizer, prefix=""):
    # Loop over evaluation tasks (only "umlm" here); the structure mirrors the
    # original MNLI matched/mis-matched double evaluation
    eval_task_names = ("umlm",)
    eval_outputs_dirs = (args.output_dir,)

    results = {}
    for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs):
        eval_dataset = load_and_cache_examples(args, eval_task, tokenizer, evaluate=True)

        if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(eval_output_dir)

        args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
        # SequentialSampler keeps the evaluation order deterministic
        eval_sampler = SequentialSampler(eval_dataset)
        eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

        # multi-gpu eval
        if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
            model = torch.nn.DataParallel(model, device_ids=args.cuda_ids)

        # Eval!
        logger.info("***** Running evaluation {} *****".format(prefix))
        logger.info("  Num examples = %d", len(eval_dataset))
        logger.info("  Batch size = %d", args.eval_batch_size)
        eval_loss = 0.0
        nb_eval_steps = 0
        preds = None
        out_label_ids = None
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            model.eval()
            batch = tuple(t.to(args.device) for t in batch)

            with torch.no_grad():
                inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
                outputs = model(**inputs)
                tmp_eval_loss, logits = outputs[:2]

                eval_loss += tmp_eval_loss.mean().item()
            nb_eval_steps += 1
            if preds is None:
                preds = logits.detach().cpu().numpy()
                out_label_ids = inputs["labels"].detach().cpu().numpy()
            else:
                preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
                out_label_ids = np.append(out_label_ids, inputs["labels"].detach().cpu().numpy(), axis=0)

        eval_loss = eval_loss / nb_eval_steps
        preds = np.argmax(preds, axis=1)
        result = compute_metrics(eval_task, preds, out_label_ids)
        results.update(result)

        output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results {} *****".format(prefix))
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

    return results
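
A minimal sketch of how this evaluate function might be driven, assuming the script-level helpers it relies on (load_and_cache_examples, compute_metrics, logger, tqdm, np, os) are already in scope and that a sequence-classification checkpoint lives in args.output_dir. The argument names follow the function body above; the concrete values and the model class are illustrative assumptions, not the project's actual configuration.

# Hypothetical driver for evaluate(); values below are illustrative only.
import argparse

import torch
from transformers import BertForSequenceClassification, BertTokenizer

args = argparse.Namespace(
    output_dir="./outputs",                      # assumed checkpoint directory
    per_gpu_eval_batch_size=32,
    n_gpu=torch.cuda.device_count(),
    device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
    local_rank=-1,
    cuda_ids=None,                               # DataParallel then uses all visible GPUs
)
model = BertForSequenceClassification.from_pretrained(args.output_dir).to(args.device)
tokenizer = BertTokenizer.from_pretrained(args.output_dir)

results = evaluate(args, model, tokenizer, prefix="")
print(results)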
Example #2
def test(args, model, tokenizer, prefix=""):
    # Loop to handle MNLI double testing (matched, mis-matched)
    test_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else (args.task_name,)
    test_outputs_dirs = (args.output_dir, args.output_dir + '-MM') if args.task_name == "mnli" \
        else (args.output_dir,)
    results = {}
    for test_task, test_output_dir in zip(test_task_names, test_outputs_dirs):
        test_output_dir = os.path.join(test_output_dir, prefix)
        test_dataset, guids = load_and_cache_examples(args, test_task, tokenizer, evaluate="test")
        if not os.path.exists(test_output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(test_output_dir)

        args.test_batch_size = args.per_gpu_test_batch_size * max(1, args.n_gpu)
        # Note that DistributedSampler samples randomly
        test_sampler = SequentialSampler(test_dataset) if args.local_rank == -1 else DistributedSampler(test_dataset)
        test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=args.test_batch_size)

        # multi-gpu test
        if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
            model = torch.nn.DataParallel(model)
        # Test!
        logger.info("***** Running testing {} *****".format(prefix))
        logger.info("  Num examplfrom_pretrainedes = %d", len(test_dataset))
        logger.info("  Batch size = %d", args.test_batch_size)
        test_loss = 0.0
        nb_test_steps = 0
        preds = None
        out_label_ids = None
        preds_normal = []
        for batch in tqdm(test_dataloader, desc="testuating"):
            model.eval()
            batch = tuple(t.to(args.device) for t in batch)

            with torch.no_grad():
                inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
                if args.model_type != "distilbert":
                    inputs["token_type_ids"] = (
                        batch[2] if args.model_type in ["bert", "xlnet", "albert"] else None
                    )  # XLM, DistilBERT, RoBERTa, and XLM-RoBERTa don't use segment_ids

                outputs = model(**inputs)
                tmp_test_loss, logits = outputs[:2]

                # Normalize first, then apply softmax (alternative kept for reference):
                # for item in torch.softmax(torch.nn.functional.normalize(logits, p=2, dim=1), dim=1).tolist():
                #     preds_normal.append(item)
                for item in torch.softmax(logits, dim=1).tolist():
                    preds_normal.append(item)

                test_loss += tmp_test_loss.mean().item()
            nb_test_steps += 1
            if preds is None:
                preds = logits.detach().cpu().numpy()
                out_label_ids = inputs['labels'].detach().cpu().numpy()
            else:
                preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
                out_label_ids = np.append(out_label_ids, inputs['labels'].detach().cpu().numpy(), axis=0)

        test_loss = test_loss / nb_test_steps
        if args.output_mode == "classification":
            preds = np.argmax(preds, axis=1)
        elif args.output_mode == "regression":
            preds = np.squeeze(preds)
        processor = processors[args.task_name]()
        result = compute_metrics(test_task, preds, out_label_ids, processor.get_labels())
        results.update(result)

        output_test_file = os.path.join(test_output_dir, "test_report.txt")
        with open(output_test_file, "w") as writer:
            logger.info("***** Test results {} *****".format(prefix))
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
            writer.write("%s = %s\n" % ("loss", str(test_loss)))

        if len(guids) == len(preds):
            if args.output_mode == "classification":
                preds = processor.label_index_to_label(preds)

                output_result_file = os.path.join(test_output_dir, "test_results.csv")
                with open(output_result_file, "w", encoding='utf-8', newline='') as result_file:
                    predict_file_writer = csv.writer(result_file, delimiter=',')
                    headers = ["id", "label"]
                    predict_file_writer.writerow(headers)
                    for index in range(len(guids)):
                        predict_file_writer.writerow([guids[index], preds[index]])
                    logger.info("Save " + test_output_dir + " down.")
                result_file.close()

                output_result_file = os.path.join(test_output_dir, "test_probability.csv")
                with open(output_result_file, "w", encoding='utf-8', newline='') as result_file:
                    predict_file_writer = csv.writer(result_file, delimiter=',')
                    headers = ["id"] + processor.get_labels()
                    predict_file_writer.writerow(headers)
                    for index in range(len(guids)):
                        predict_file_writer.writerow([guids[index]] + preds_normal[index])
                    logger.info("Save " + test_output_dir + " down.")
                    result_file.close()
            elif args.output_mode == "regression":
                output_result_file = os.path.join(test_output_dir, "test_regression.csv")
                with open(output_result_file, "w", encoding='utf-8', newline='') as result_file:
                    predict_file_writer = csv.writer(result_file, delimiter=',')
                    headers = ["id"] + processor.get_labels()
                    predict_file_writer.writerow(headers)
                    for index in range(len(guids)):
                        predict_file_writer.writerow([guids[index]] + preds_normal[index])
                    logger.info("Save " + test_output_dir + " down.")
                result_file.close()
        else:
            raise ValueError("The length of guid and the length of pred is not match: len(guid) = %s, len(pred) %s" % (
                len(guids), len(preds)))

    return results
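
Similarly, a hedged sketch of calling the test function at the end of a run. It assumes the same surrounding-script helpers (load_and_cache_examples, compute_metrics, processors, logger) plus csv are in scope; the task name, checkpoint prefix, and model class below are hypothetical placeholders.

# Hypothetical driver for test(); all values are placeholders.
import argparse

import torch
from transformers import BertForSequenceClassification, BertTokenizer

args = argparse.Namespace(
    task_name="mnli",                            # "mnli" exercises the matched/mis-matched loop
    model_type="bert",                           # so token_type_ids are passed to the model
    output_mode="classification",
    output_dir="./outputs",
    per_gpu_test_batch_size=32,
    n_gpu=torch.cuda.device_count(),
    device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
    local_rank=-1,
)
model = BertForSequenceClassification.from_pretrained(args.output_dir).to(args.device)
tokenizer = BertTokenizer.from_pretrained(args.output_dir)

results = test(args, model, tokenizer, prefix="checkpoint-final")
print(results)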