Example #1
def evaluate(model: TransformerModelWrapper,
             eval_data: List[InputExample],
             config: EvalConfig) -> Dict:

    metrics = config.metrics if config.metrics else ['acc']
    results = model.eval(eval_data=eval_data,
                         per_gpu_eval_batch_size=config.per_gpu_eval_batch_size,
                         n_gpu=config.n_gpu)
    # print("results['logits'].shape=", results['logits'].shape)
    predictions = np.argmax(results['logits'], axis=1)
    scores = {}
    for metric in metrics:
        if metric == 'acc':
            scores[metric] = simple_accuracy(predictions, results['labels'])
        elif metric == 'f1':
            scores[metric] = f1_score(results['labels'], predictions)
        elif metric == 'f1-macro':
            scores[metric] = f1_score(results['labels'], predictions, average='macro')
        elif metric == 'em':
            scores[metric] = exact_match(predictions, results['labels'], results['question_ids'])
        else:
            raise ValueError(f"Metric '{metric}' not implemented")
    results['scores'] = scores
    results['predictions'] = predictions
    return results
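A minimal usage sketch for the function above. The EvalConfig construction, the data-loading helper and the already-trained model wrapper are assumptions for illustration; the config field names simply mirror how the config object is read inside evaluate:

# Hypothetical usage; the EvalConfig constructor and the load_examples call are assumptions.
eval_config = EvalConfig(metrics=['acc', 'f1-macro'],
                         per_gpu_eval_batch_size=8,
                         n_gpu=1)
eval_examples = load_examples(task_name, data_dir, set_type=TRAIN_SET,
                              num_examples_per_label=100)
results = evaluate(model, eval_examples, eval_config)  # model: a trained TransformerModelWrapper
print(results['scores'])           # per-metric scores, e.g. {'acc': ..., 'f1-macro': ...}
print(results['predictions'][:5])  # per-example argmax over the logits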
Example #2
def evaluate(
    model: TransformerModelWrapper,
    eval_data: List[InputExample],
    config: EvalConfig,
    priming_data: List[InputExample] = None,
    local_rank=-1,
) -> Dict:
    """
    Evaluate a model.

    :param model: the model to evaluate
    :param eval_data: the examples for evaluation
    :param config: the evaluation config
    :param priming_data: an optional list of priming data to use
    :param local_rank: the local rank for distributed evaluation (-1 means no distributed evaluation)
    :return: a dictionary containing the model's logits, predictions and (if any metrics are given) scores
    """

    if config.priming:
        for example in eval_data:
            example.meta["priming_data"] = priming_data

    metrics = config.metrics if config.metrics else ["acc"]
    device = torch.device(config.device if config.device
                          else "cuda" if torch.cuda.is_available() else "cpu")

    model.model.to(device)
    results = model.eval(
        eval_data,
        device,
        per_gpu_eval_batch_size=config.per_gpu_eval_batch_size,
        n_gpu=config.n_gpu,
        decoding_strategy=config.decoding_strategy,
        priming=config.priming,
        local_rank=local_rank,
    )

    predictions = np.argmax(results["logits"], axis=1)
    scores = {}

    for metric in metrics:
        if metric == "acc":
            scores[metric] = simple_accuracy(predictions, results["labels"])
        elif metric == "f1":
            scores[metric] = f1_score(results["labels"], predictions)
        elif metric == "f1-macro":
            scores[metric] = f1_score(results["labels"],
                                      predictions,
                                      average="macro")
        elif metric == "em":
            scores[metric] = exact_match(predictions, results["labels"],
                                         results["question_ids"])
        else:
            raise ValueError(f"Metric '{metric}' not implemented")

    results["scores"] = scores
    results["predictions"] = predictions
    return results
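For reference, a minimal sketch of the metric helpers this dispatch relies on. simple_accuracy is the plain match rate over labels; the per-question grouping in exact_match is an assumed semantics (a question is correct only if all of its candidates are), not necessarily the library's verbatim implementation:

from collections import defaultdict

import numpy as np


def simple_accuracy(preds, labels):
    # fraction of predictions that equal the gold labels
    return float((np.asarray(preds) == np.asarray(labels)).mean())


def exact_match(preds, labels, question_ids):
    # assumed semantics: a question counts as correct only if every
    # candidate belonging to it is predicted correctly
    per_question = defaultdict(list)
    for pred, label, qid in zip(preds, labels, question_ids):
        per_question[qid].append(pred == label)
    return float(np.mean([all(flags) for flags in per_question.values()]))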
Example #3
def evaluate(model: TransformerModelWrapper,
             eval_data: List[InputExample],
             config: EvalConfig,
             priming_data: List[InputExample] = None) -> Dict:
    """
    Evaluate a model.

    :param model: the model to evaluate
    :param eval_data: the examples for evaluation
    :param config: the evaluation config
    :param priming_data: an optional list of priming data to use
    :return: a dictionary containing the model's logits, predictions and (if any metrics are given) scores
    """

    if config.priming:
        for example in eval_data:
            example.meta['priming_data'] = priming_data

    metrics = config.metrics if config.metrics else ['acc']
    device = torch.device(config.device if config.device
                          else "cuda" if torch.cuda.is_available() else "cpu")

    model.model.to(device)
    results = model.eval(
        eval_data,
        device,
        per_gpu_eval_batch_size=config.per_gpu_eval_batch_size,
        n_gpu=config.n_gpu,
        decoding_strategy=config.decoding_strategy,
        priming=config.priming)

    predictions = np.argmax(results['logits'], axis=1)
    scores = {}

    for metric in metrics:
        if metric == 'acc':
            scores[metric] = simple_accuracy(predictions, results['labels'])
        elif metric == 'f1':
            scores[metric] = f1_score(results['labels'], predictions)
        elif metric == 'f1-macro':
            scores[metric] = f1_score(results['labels'],
                                      predictions,
                                      average='macro')
        elif metric == 'em':
            scores[metric] = exact_match(predictions, results['labels'],
                                         results['question_ids'])
        elif metric == 'dist-loss':
            if eval_data[0].logits is not None:
                scores[metric] = distillation_loss(
                    torch.tensor(results['logits']),
                    torch.stack([
                        torch.tensor(ex.logits, dtype=torch.float32)
                        for ex in eval_data
                    ]), config.temperature)
            else:
                scores[metric] = 0.
        else:
            raise ValueError(f"Metric '{metric}' not implemented")

    results['scores'] = scores
    results['predictions'] = predictions
    return results
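The 'dist-loss' branch additionally relies on a distillation_loss helper and a config.temperature value. A plausible sketch, assuming the usual temperature-scaled KL formulation of knowledge distillation (not necessarily the library's exact definition):

import torch.nn.functional as F


def distillation_loss(predictions, targets, temperature):
    # soften both distributions with the temperature, then measure how far
    # the student (predictions) is from the teacher (targets)
    p = F.log_softmax(predictions / temperature, dim=1)
    q = F.softmax(targets / temperature, dim=1)
    return F.kl_div(p, q, reduction='sum') * (temperature ** 2) / predictions.shape[0]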
Example #4
def main():
    parser = argparse.ArgumentParser()

    # required parameters
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory. The verbalizers are written to a file 'verbalizer.json' in this directory.",
    )
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help="The input data dir. Should contain the data files for the task.",
    )
    parser.add_argument(
        "--model_type",
        default=None,
        type=str,
        required=True,
        help="The model type",
    )
    parser.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        required=True,
        help="Path to pre-trained model or shortcut name",
    )
    parser.add_argument(
        "--task_name",
        default=None,
        type=str,
        required=True,
        help="The name of the task to train selected in the list: " +
        ", ".join(PROCESSORS.keys()),
    )

    # verbalizer search hyperparameters
    parser.add_argument(
        "--normalize",
        action="store_true",
        help=
        "Whether to normalize the loss as proposed in the paper. It is recommended to set this to 'true'.",
    )
    parser.add_argument(
        "--combine_patterns",
        action="store_true",
        help=
        "If set to true, a single joint verbalizer is searched for all patterns",
    )
    parser.add_argument(
        "--num_candidates",
        default=1000,
        type=int,
        help=
        "The number of candidate tokens to consider as verbalizers (see Section 4.1 of the paper)",
    )
    parser.add_argument(
        "--words_per_label",
        default=10,
        type=int,
        help="The number of verbalizer tokens to assign to each label",
    )
    parser.add_argument(
        "--score_fct",
        default="llr",
        choices=["llr", "ce", "random"],
        help=
        "The function used to score verbalizers. Choices are: the log-likelihood ratio loss proposed in the paper "
        "('llr'), cross-entropy loss ('ce') and 'random', which assigns random tokens to each label.",
    )

    # other optional parameters
    parser.add_argument(
        "--train_examples",
        default=50,
        type=int,
        help=
        "The total number of train examples to use, where -1 equals all examples.",
    )
    parser.add_argument(
        "--pattern_ids",
        default=[0],
        type=int,
        nargs="+",
        help="The ids of the PVPs to be used",
    )
    parser.add_argument(
        "--max_seq_length",
        default=256,
        type=int,
        help=
        "The maximum total input sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.",
    )
    parser.add_argument(
        "--per_gpu_eval_batch_size",
        default=8,
        type=int,
        help="Batch size per GPU/CPU for evaluation.",
    )
    parser.add_argument(
        "--words_file",
        default=None,
        type=str,
        help=
        "Path to a file containing (unlabeled) texts from the task's domain. This text is used to compute "
        "verbalization candidates by selecting the most frequent words.",
    )
    parser.add_argument(
        "--max_words",
        default=10000,
        type=int,
        help=
        "Only the 10,000 tokens that occur most frequently in the task’s unlabeled data (see --words_file) are "
        "considered as verbalization candidates",
    )
    parser.add_argument(
        "--additional_input_examples",
        type=str,
        help=
        "An optional path to an additional set of input examples (e.g., obtained using iPET)",
    )
    parser.add_argument("--seed",
                        default=42,
                        type=int,
                        help="random seed for initialization")

    args = parser.parse_args()
    random.seed(args.seed)

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    with open(os.path.join(args.output_dir, "config.txt"),
              "w",
              encoding="utf8") as fh:
        json.dump(args.__dict__, fh, indent=2)

    # setup gpu/cpu
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    args.n_gpu = torch.cuda.device_count()

    # prepare task
    args.task_name = args.task_name.lower()
    if args.task_name not in PROCESSORS:
        raise ValueError("Task not found: {}".format(args.task_name))
    processor = PROCESSORS[args.task_name]()
    args.label_list = processor.get_labels()
    args.cache_dir = ""
    args.do_lower_case = False
    args.verbalizer_file = None
    args.wrapper_type = "mlm"

    # get training data
    train_examples_per_label = (eq_div(args.train_examples, len(args.label_list))
                                if args.train_examples != -1 else -1)
    train_data = load_examples(
        args.task_name,
        args.data_dir,
        set_type=TRAIN_SET,
        num_examples_per_label=train_examples_per_label,
    )
    if args.additional_input_examples:
        additional_data = InputExample.load_examples(
            args.additional_input_examples)
        train_data += additional_data
        logger.info(
            f"Loaded {len(additional_data)} additional examples from {args.additional_input_examples}, total"
            f"training set size is now {len(train_data)}")

    expected = {
        label: np.array([1 if x.label == label else 0 for x in train_data])
        for label in args.label_list
    }

    if args.words_file:
        with open(args.words_file, "r", encoding="utf8") as fh:
            word_counts = Counter(fh.read().split())
    else:
        word_counts = None

    tokenizer_class = MODEL_CLASSES[args.model_type]["tokenizer"]
    tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)
    word2idx = get_word_to_id_map(tokenizer,
                                  word_counts=word_counts,
                                  max_words=args.max_words)

    logits = []

    for pattern_id in args.pattern_ids:
        logger.info(f"Processing examples with pattern id {pattern_id}...")
        args.pattern_id = pattern_id

        config = WrapperConfig(
            model_type=args.model_type,
            model_name_or_path=args.model_name_or_path,
            wrapper_type="mlm",
            task_name=args.task_name,
            max_seq_length=args.max_seq_length,
            label_list=args.label_list,
            pattern_id=args.pattern_id,
        )

        wrapper = TransformerModelWrapper(config)
        wrapper.model.to(device)
        # modify all patterns so that they return a single text segment instead of two segments
        get_parts = wrapper.preprocessor.pvp.get_parts
        wrapper.preprocessor.pvp.get_parts = lambda example: (
            get_parts(example)[0] + get_parts(example)[1],
            [],
        )
        wrapper.preprocessor.pvp.convert_mlm_logits_to_cls_logits = \
            lambda mask, x, _=None: x[mask >= 0]

        pattern_logits = wrapper.eval(
            train_data,
            device,
            per_gpu_eval_batch_size=args.per_gpu_eval_batch_size,
            n_gpu=args.n_gpu,
        )["logits"]
        pattern_logits = pattern_logits - np.max(pattern_logits, axis=1, keepdims=True)
        logits.append(pattern_logits)

    logger.info("Starting verbalizer search...")

    if args.combine_patterns:
        avs = AutomaticVerbalizerSearch(word2idx, args.label_list, logits,
                                        expected)
        verbalizer = avs.find_verbalizer(
            num_candidates=args.num_candidates,
            words_per_label=args.words_per_label,
            normalize=args.normalize,
            score_fct=args.score_fct,
        )
        verbalizers = {
            pattern_id: verbalizer
            for pattern_id in args.pattern_ids
        }

    else:
        verbalizers = {}
        for idx, pattern_id in enumerate(args.pattern_ids):
            avs = AutomaticVerbalizerSearch(word2idx, args.label_list,
                                            [logits[idx]], expected)
            verbalizers[pattern_id] = avs.find_verbalizer(
                num_candidates=args.num_candidates,
                words_per_label=args.words_per_label,
                normalize=args.normalize,
                score_fct=args.score_fct,
            )

    print(json.dumps(verbalizers, indent=2))
    logger.info("Verbalizer search complete, writing output...")

    with open(os.path.join(args.output_dir, "verbalizers.json"),
              "w",
              encoding="utf8") as fh:
        json.dump(verbalizers, fh, indent=2)

    logger.info("Done")