# Assumed imports; the project-internal helpers (TransformerModelWrapper,
# InputExample, EvalConfig, simple_accuracy, exact_match, distillation_loss,
# ...) are defined in the surrounding codebase and their module paths may
# differ between checkouts.
from typing import Dict, List

import numpy as np
import torch
from sklearn.metrics import f1_score


# evaluate(), minimal variant: no priming, device handling or distillation metric.
def evaluate(model: TransformerModelWrapper, eval_data: List[InputExample], config: EvalConfig) -> Dict:
    metrics = config.metrics if config.metrics else ['acc']
    results = model.eval(eval_data=eval_data, per_gpu_eval_batch_size=config.per_gpu_eval_batch_size,
                         n_gpu=config.n_gpu)

    predictions = np.argmax(results['logits'], axis=1)
    scores = {}

    for metric in metrics:
        if metric == 'acc':
            scores[metric] = simple_accuracy(predictions, results['labels'])
        elif metric == 'f1':
            scores[metric] = f1_score(results['labels'], predictions)
        elif metric == 'f1-macro':
            scores[metric] = f1_score(results['labels'], predictions, average='macro')
        elif metric == 'em':
            scores[metric] = exact_match(predictions, results['labels'], results['question_ids'])
        else:
            raise ValueError(f"Metric '{metric}' not implemented")

    results['scores'] = scores
    results['predictions'] = predictions
    return results
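# Hypothetical usage of the minimal variant above. The EvalConfig fields
# mirror the attributes read inside evaluate(), but the constructor signature,
# the wrapper and the data are placeholders, not part of the codebase:
#
#   config = EvalConfig(metrics=['acc', 'f1-macro'], per_gpu_eval_batch_size=8, n_gpu=1)
#   results = evaluate(wrapper, dev_examples, config)
#   print(results['scores'])           # e.g. {'acc': ..., 'f1-macro': ...}
#   print(results['predictions'][:10])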
# evaluate(), extended variant: adds priming support, explicit device
# placement and a local_rank argument for distributed evaluation.
def evaluate(
    model: TransformerModelWrapper,
    eval_data: List[InputExample],
    config: EvalConfig,
    priming_data: List[InputExample] = None,
    local_rank=-1,
) -> Dict:
    """
    Evaluate a model.

    :param model: the model to evaluate
    :param eval_data: the examples for evaluation
    :param config: the evaluation config
    :param priming_data: an optional list of priming data to use
    :param local_rank: the local rank for distributed evaluation (-1 means no distributed evaluation)
    :return: a dictionary containing the model's logits, predictions and (if any metrics are given) scores
    """
    if config.priming:
        for example in eval_data:
            example.meta["priming_data"] = priming_data

    metrics = config.metrics if config.metrics else ["acc"]
    device = torch.device(config.device if config.device else "cuda" if torch.cuda.is_available() else "cpu")

    model.model.to(device)
    results = model.eval(
        eval_data,
        device,
        per_gpu_eval_batch_size=config.per_gpu_eval_batch_size,
        n_gpu=config.n_gpu,
        decoding_strategy=config.decoding_strategy,
        priming=config.priming,
        local_rank=local_rank,
    )

    predictions = np.argmax(results["logits"], axis=1)
    scores = {}

    for metric in metrics:
        if metric == "acc":
            scores[metric] = simple_accuracy(predictions, results["labels"])
        elif metric == "f1":
            scores[metric] = f1_score(results["labels"], predictions)
        elif metric == "f1-macro":
            scores[metric] = f1_score(results["labels"], predictions, average="macro")
        elif metric == "em":
            scores[metric] = exact_match(predictions, results["labels"], results["question_ids"])
        else:
            raise ValueError(f"Metric '{metric}' not implemented")

    results["scores"] = scores
    results["predictions"] = predictions
    return results
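# Hypothetical invocation of the extended variant with priming (in-context
# learning) enabled; again, the EvalConfig constructor arguments are assumed
# from the fields read inside evaluate():
#
#   config = EvalConfig(device="cuda", n_gpu=1, per_gpu_eval_batch_size=4,
#                       metrics=["acc"], decoding_strategy="default", priming=True)
#   results = evaluate(wrapper, dev_examples, config, priming_data=train_examples)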
# evaluate(), distillation variant: like the extended variant above (without
# local_rank), plus a 'dist-loss' metric for knowledge distillation.
def evaluate(model: TransformerModelWrapper, eval_data: List[InputExample], config: EvalConfig,
             priming_data: List[InputExample] = None) -> Dict:
    """
    Evaluate a model.

    :param model: the model to evaluate
    :param eval_data: the examples for evaluation
    :param config: the evaluation config
    :param priming_data: an optional list of priming data to use
    :return: a dictionary containing the model's logits, predictions and (if any metrics are given) scores
    """
    if config.priming:
        for example in eval_data:
            example.meta['priming_data'] = priming_data

    metrics = config.metrics if config.metrics else ['acc']
    device = torch.device(config.device if config.device else "cuda" if torch.cuda.is_available() else "cpu")

    model.model.to(device)
    results = model.eval(eval_data, device, per_gpu_eval_batch_size=config.per_gpu_eval_batch_size,
                         n_gpu=config.n_gpu, decoding_strategy=config.decoding_strategy, priming=config.priming)

    predictions = np.argmax(results['logits'], axis=1)
    scores = {}

    for metric in metrics:
        if metric == 'acc':
            scores[metric] = simple_accuracy(predictions, results['labels'])
        elif metric == 'f1':
            scores[metric] = f1_score(results['labels'], predictions)
        elif metric == 'f1-macro':
            scores[metric] = f1_score(results['labels'], predictions, average='macro')
        elif metric == 'em':
            scores[metric] = exact_match(predictions, results['labels'], results['question_ids'])
        elif metric == 'dist-loss':
            # only computable if the eval examples carry teacher logits
            if eval_data[0].logits is not None:
                scores[metric] = distillation_loss(
                    torch.tensor(results['logits']),
                    torch.stack([torch.tensor(ex.logits, dtype=torch.float32) for ex in eval_data]),
                    config.temperature)
            else:
                scores[metric] = 0.
        else:
            raise ValueError(f"Metric '{metric}' not implemented")

    results['scores'] = scores
    results['predictions'] = predictions
    return results
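# The 'dist-loss' branch above delegates to distillation_loss(). For
# reference, a minimal sketch of a temperature-scaled distillation loss in the
# style of Hinton et al. (2015); this is an assumption about what the helper
# computes, not its actual definition in the codebase:
import torch.nn.functional as F


def distillation_loss_sketch(student_logits: torch.Tensor, teacher_logits: torch.Tensor,
                             temperature: float) -> torch.Tensor:
    # Soften both distributions with the temperature, then take the KL
    # divergence; the T^2 factor keeps gradient magnitudes comparable
    # across temperatures.
    log_p = F.log_softmax(student_logits / temperature, dim=1)  # student log-probs
    q = F.softmax(teacher_logits / temperature, dim=1)          # teacher probs
    return F.kl_div(log_p, q, reduction='batchmean') * temperature ** 2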
# Assumed imports for the verbalizer-search script; project-internal names
# (PROCESSORS, MODEL_CLASSES, TRAIN_SET, InputExample, load_examples, eq_div,
# get_word_to_id_map, WrapperConfig, AutomaticVerbalizerSearch, logger, ...)
# are defined in the surrounding codebase.
import argparse
import json
import os
import random
from collections import Counter

import numpy as np
import torch


def main():
    parser = argparse.ArgumentParser()

    # required parameters
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory. The verbalizers are written to a file 'verbalizers.json' in this directory.")
    parser.add_argument("--data_dir", default=None, type=str, required=True,
                        help="The input data dir. Should contain the data files for the task.")
    parser.add_argument("--model_type", default=None, type=str, required=True,
                        help="The model type")
    parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
                        help="Path to pre-trained model or shortcut name")
    parser.add_argument("--task_name", default=None, type=str, required=True,
                        help="The name of the task to train, selected from the list: " + ", ".join(PROCESSORS.keys()))

    # verbalizer search hyperparameters
    parser.add_argument("--normalize", action="store_true",
                        help="Whether to normalize the loss as proposed in the paper. It is recommended to set this flag.")
    parser.add_argument("--combine_patterns", action="store_true",
                        help="If set, a single joint verbalizer is searched for all patterns")
    parser.add_argument("--num_candidates", default=1000, type=int,
                        help="The number of candidate tokens to consider as verbalizers (see Section 4.1 of the paper)")
    parser.add_argument("--words_per_label", default=10, type=int,
                        help="The number of verbalizer tokens to assign to each label")
    parser.add_argument("--score_fct", default="llr", choices=["llr", "ce", "random"],
                        help="The function used to score verbalizers. Choices are: the log-likelihood ratio loss proposed in the "
                             "paper ('llr'), cross-entropy loss ('ce') and 'random', which assigns random tokens to each label.")

    # other optional parameters
    parser.add_argument("--train_examples", default=50, type=int,
                        help="The total number of train examples to use, where -1 equals all examples.")
    parser.add_argument("--pattern_ids", default=[0], type=int, nargs="+",
                        help="The ids of the PVPs to be used")
    parser.add_argument("--max_seq_length", default=256, type=int,
                        help="The maximum total input sequence length after tokenization. Sequences longer "
                             "than this will be truncated, sequences shorter will be padded.")
    parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument("--words_file", default=None, type=str,
                        help="Path to a file containing (unlabeled) texts from the task's domain. This text is used to compute "
                             "verbalization candidates by selecting the most frequent words.")
    parser.add_argument("--max_words", default=10000, type=int,
                        help="Only this many of the most frequent tokens in the task's unlabeled data (see --words_file) are "
                             "considered as verbalization candidates")
    parser.add_argument("--additional_input_examples", type=str,
                        help="An optional path to an additional set of input examples (e.g., obtained using iPET)")
    parser.add_argument("--seed", default=42, type=int,
                        help="Random seed for initialization")

    args = parser.parse_args()
    random.seed(args.seed)

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    with open(os.path.join(args.output_dir, "config.txt"), "w", encoding="utf8") as fh:
        json.dump(args.__dict__, fh, indent=2)

    # set up GPU/CPU
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    args.n_gpu = torch.cuda.device_count()

    # prepare the task
    args.task_name = args.task_name.lower()
    if args.task_name not in PROCESSORS:
        raise ValueError("Task not found: {}".format(args.task_name))
    processor = PROCESSORS[args.task_name]()
    args.label_list = processor.get_labels()

    args.cache_dir = ""
    args.do_lower_case = False
    args.verbalizer_file = None
    args.wrapper_type = "mlm"

    # get the training data
    train_examples_per_label = eq_div(args.train_examples, len(args.label_list)) if args.train_examples != -1 else -1
    train_data = load_examples(args.task_name, args.data_dir, set_type=TRAIN_SET,
                               num_examples_per_label=train_examples_per_label)

    if args.additional_input_examples:
        additional_data = InputExample.load_examples(args.additional_input_examples)
        train_data += additional_data
        logger.info(f"Loaded {len(additional_data)} additional examples from {args.additional_input_examples}, "
                    f"total training set size is now {len(train_data)}")

    # one-hot indicator vectors marking which examples belong to which label
    expected = {label: np.array([1 if x.label == label else 0 for x in train_data]) for label in args.label_list}

    if args.words_file:
        with open(args.words_file, "r", encoding="utf8") as fh:
            word_counts = Counter(fh.read().split())
    else:
        word_counts = None

    tokenizer_class = MODEL_CLASSES[args.model_type]["tokenizer"]
    tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)
    word2idx = get_word_to_id_map(tokenizer, word_counts=word_counts, max_words=args.max_words)

    logits = []

    for pattern_id in args.pattern_ids:
        logger.info(f"Processing examples with pattern id {pattern_id}...")
        args.pattern_id = pattern_id

        config = WrapperConfig(model_type=args.model_type, model_name_or_path=args.model_name_or_path,
                               wrapper_type="mlm", task_name=args.task_name, max_seq_length=args.max_seq_length,
                               label_list=args.label_list, pattern_id=args.pattern_id)
        wrapper = TransformerModelWrapper(config)
        wrapper.model.to(device)

        # modify all patterns so that they return a single text segment instead of two segments
        get_parts = wrapper.preprocessor.pvp.get_parts
        wrapper.preprocessor.pvp.get_parts = lambda example: (get_parts(example)[0] + get_parts(example)[1], [])
        # return the raw vocabulary logits at the mask position instead of verbalizer-based class logits
        wrapper.preprocessor.pvp.convert_mlm_logits_to_cls_logits = lambda mask, x, _=None: x[mask >= 0]

        pattern_logits = wrapper.eval(train_data, device, per_gpu_eval_batch_size=args.per_gpu_eval_batch_size,
                                      n_gpu=args.n_gpu)["logits"]
        # shift logits so that each example's maximum is zero (numerical stability)
        pattern_logits = pattern_logits - np.expand_dims(np.max(pattern_logits, axis=1), axis=1)
        logits.append(pattern_logits)

    logger.info("Starting verbalizer search...")

    if args.combine_patterns:
        avs = AutomaticVerbalizerSearch(word2idx, args.label_list, logits, expected)
        verbalizer = avs.find_verbalizer(num_candidates=args.num_candidates, words_per_label=args.words_per_label,
                                         normalize=args.normalize, score_fct=args.score_fct)
        verbalizers = {pattern_id: verbalizer for pattern_id in args.pattern_ids}
    else:
        verbalizers = {}
        for idx, pattern_id in enumerate(args.pattern_ids):
            avs = AutomaticVerbalizerSearch(word2idx, args.label_list, [logits[idx]], expected)
            verbalizers[pattern_id] = avs.find_verbalizer(num_candidates=args.num_candidates,
                                                          words_per_label=args.words_per_label,
                                                          normalize=args.normalize, score_fct=args.score_fct)

    print(json.dumps(verbalizers, indent=2))
    logger.info("Verbalizer search complete, writing output...")

    with open(os.path.join(args.output_dir, "verbalizers.json"), "w", encoding="utf8") as fh:
        json.dump(verbalizers, fh, indent=2)

    logger.info("Done")
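# Example invocation of this script; the flags are the ones defined above,
# while the script name, paths, task and model are placeholders:
#
#   python petal.py \
#       --output_dir ./verbalizer_output \
#       --data_dir ./data/yelp \
#       --model_type roberta \
#       --model_name_or_path roberta-large \
#       --task_name yelp-polarity \
#       --normalize \
#       --words_per_label 10 \
#       --pattern_ids 0 1 2 3


if __name__ == "__main__":
    main()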