def load_and_cache_examples(args, task, tokenizer, data_type='train'):
    # Treat anything other than the training split as evaluation.
    evaluate = data_type != 'train'
    if args.local_rank not in [-1, 0] and not evaluate:
        # Make sure only the first process in distributed training processes
        # the dataset; the others will use the cache.
        torch.distributed.barrier()
    processor = processors[task]()
    output_mode = output_modes[task]
    label_list = processor.get_labels()
    if data_type == 'train':
        examples = processor.get_train_examples(args.data_dir)
    elif data_type == 'dev':
        examples = processor.get_dev_examples(args.data_dir)
    else:
        examples = processor.get_test_examples(args.data_dir)
    features = convert_examples_to_features(examples,
                                            tokenizer,
                                            label_list=label_list,
                                            max_seq_length=args.max_seq_length,
                                            output_mode=output_mode)
    if args.local_rank == 0 and not evaluate:
        torch.distributed.barrier()
    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
    all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
    all_lens = torch.tensor([f.input_len for f in features], dtype=torch.long)
    all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
    dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids,
                            all_lens, all_labels)
    return dataset

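# Illustrative usage sketch (not part of the original source): wrap the
# TensorDataset returned above in a DataLoader for training. The attribute
# `args.train_batch_size` and the helper name are assumptions for this sketch.
def _example_build_train_dataloader(args, task, tokenizer):
    from torch.utils.data import DataLoader, RandomSampler

    train_dataset = load_and_cache_examples(args, task, tokenizer, data_type='train')
    # Shuffle every epoch; a DistributedSampler would replace this for multi-GPU runs.
    train_sampler = RandomSampler(train_dataset)
    return DataLoader(train_dataset, sampler=train_sampler,
                      batch_size=args.train_batch_size)
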
def load_and_cache_examples(args, task, tokenizer, data_type='train'):
    # Treat anything other than the training split as evaluation.
    evaluate = data_type != 'train'
    if args.local_rank not in [-1, 0] and not evaluate:
        # Make sure only the first process in distributed training processes
        # the dataset; the others will use the cache.
        torch.distributed.barrier()
    processor = processors[task]()
    output_mode = output_modes[task]
    # Load data features from cache or dataset file
    cached_features_file = os.path.join(
        args.data_dir,
        'cached_{}_{}_{}_{}'.format(
            data_type,
            list(filter(None, args.model_name_or_path.split('/'))).pop(),
            str(args.max_seq_length),
            str(task)))
    if os.path.exists(cached_features_file):
        logger.info("Loading features from cached file %s", cached_features_file)
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", args.data_dir)
        if task == "ner":
            label_list = processor.get_labels_ner(args.data_dir, args.label_with_bi)
        else:
            label_list = processor.get_labels()
        if task in ['mnli', 'mnli-mm'] and 'roberta' in args.model_type:
            # HACK(label indices are swapped in RoBERTa pretrained model)
            label_list[1], label_list[2] = label_list[2], label_list[1]
        if data_type == 'train':
            examples = processor.get_train_examples(args.data_dir)
        elif data_type == 'dev':
            examples = processor.get_dev_examples(args.data_dir)
        else:
            examples = processor.get_test_examples(args.data_dir)
        features = convert_examples_to_features(
            examples,
            tokenizer,
            label_list=label_list,
            max_seq_length=args.max_seq_length,
            output_mode=output_mode)
        if args.local_rank in [-1, 0]:
            # Persist the features so later runs hit the cache branch above.
            logger.info("Saving features into cached file %s", cached_features_file)
            torch.save(features, cached_features_file)
    if args.local_rank == 0 and not evaluate:
        torch.distributed.barrier()
    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
    all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
    all_lens = torch.tensor([f.input_len for f in features], dtype=torch.long)
    all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
    dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids,
                            all_lens, all_labels)
    return dataset

def load_and_cache_examples(args, task, tokenizer, evaluate=False):
    if args.local_rank not in [-1, 0] and not evaluate:
        # Make sure only the first process in distributed training processes
        # the dataset; the others will use the cache.
        torch.distributed.barrier()
    processor = processors[task]()
    output_mode = output_modes[task]
    # Load data features from cache or dataset file
    cached_features_file = os.path.join(
        args.data_dir,
        "cached_{}_{}_{}_{}".format(
            "dev" if evaluate else "train",
            list(filter(None, args.model_name_or_path.split("/"))).pop(),
            str(args.max_seq_length),
            str(task),
        ),
    )
    if os.path.exists(cached_features_file) and not args.overwrite_cache:
        logger.info("Loading features from cached file %s", cached_features_file)
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", args.data_dir)
        label_list = processor.get_labels()
        if task in ["mnli", "mnli-mm"] and args.model_type in ["roberta", "xlmroberta"]:
            # HACK(label indices are swapped in RoBERTa pretrained model)
            label_list[1], label_list[2] = label_list[2], label_list[1]
        examples = (processor.get_dev_examples(args.data_dir)
                    if evaluate else processor.get_train_examples(args.data_dir))
        features = convert_examples_to_features(
            examples,
            tokenizer,
            label_list=label_list,
            max_length=args.max_seq_length,
            output_mode=output_mode,
            pad_on_left=False,  # pad on the left for xlnet
            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
            pad_token_segment_id=0,
        )
        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s", cached_features_file)
            torch.save(features, cached_features_file)
    if args.local_rank == 0 and not evaluate:
        torch.distributed.barrier()
    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
    all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
    if output_mode == "classification":
        all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
    elif output_mode == "regression":
        all_labels = torch.tensor([f.label for f in features], dtype=torch.float)
    dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids,
                            all_labels)
    return dataset

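# Sketch of the argparse flags the loaders above read. The original parser is
# not part of this excerpt, so the names and defaults below are assumptions
# reconstructed from the attribute accesses in these functions.
def _example_build_args():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--data_dir", type=str, required=True)
    parser.add_argument("--model_name_or_path", type=str, required=True)
    parser.add_argument("--model_type", type=str, default="bert")
    parser.add_argument("--max_seq_length", type=int, default=128)
    parser.add_argument("--overwrite_cache", action="store_true")
    parser.add_argument("--local_rank", type=int, default=-1)
    return parser.parse_args()
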
def load_and_cache_examples(args, data, task, evaluate=False, test=False):
    '''
    ALUE:
    - dev: MDD reads the "test" split
    - test: XNLI has both "test" and "diag" splits
    '''
    if args.local_rank not in [-1, 0] and not evaluate:
        # Make sure only the first process in distributed training processes
        # the dataset; the others will use the cache.
        torch.distributed.barrier()
    main_task = task.split("-")[0]
    output_mode = output_modes[main_task]
    # Load data features from cache or dataset file
    data_type = "dev" if evaluate else "train"
    if test:
        data_type = "test"
    cached_features_file = os.path.join(
        "features",
        "cached_{}_{}_{}_{}".format(
            data_type,
            list(filter(None, args.model_path.split("/"))).pop(),
            str(args.max_seq_length),
            str(task),
        ),
    )
    if not os.path.exists("features"):
        os.makedirs("features")
    if os.path.exists(cached_features_file) and not args.overwrite_cache:
        logger.info("Loading features from cached file %s", cached_features_file)
        features = torch.load(cached_features_file)
    else:
        label_list = tasks_label_list[main_task]
        data_key = main_task.upper()
        if test:
            if task == "xnli-diag":
                examples = data[data_key]['diag']
            else:
                examples = data[data_key]['test']
        elif evaluate:
            examples = data[data_key]['dev']
        else:
            examples = data[data_key]['train']
        features = convert_examples_to_features(
            examples,
            label_list=label_list,
            max_length=args.max_seq_length,
            output_mode=output_mode,
            pad_on_left=False,  # pad on the left for xlnet
            pad_token=0,
            pad_token_segment_id=0,
        )
        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s", cached_features_file)
            torch.save(features, cached_features_file)
    if args.local_rank == 0 and not evaluate:
        torch.distributed.barrier()
    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
    all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
    if test:
        # Test sets carry no gold labels; use -1 as a placeholder.
        all_labels = torch.tensor([-1 for f in features], dtype=torch.long)
    elif output_mode in ["classification", "multilabel"]:
        all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
    elif output_mode == "regression":
        all_labels = torch.tensor([f.label for f in features], dtype=torch.float)
    dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids,
                            all_labels)
    return dataset

def load_and_cache_examples(args, task, tokenizer, data_type='train'):
    # Treat anything other than the training split as evaluation.
    evaluate = data_type != 'train'
    if args.local_rank not in [-1, 0] and not evaluate:
        # Make sure only the first process in distributed training processes
        # the dataset; the others will use the cache.
        torch.distributed.barrier()
    processor = processors[task]()
    output_mode = output_modes[task]
    # Load data features from cache or dataset file
    cached_features_file = os.path.join(
        args.data_dir,
        'cached_{}_{}_{}_{}'.format(
            data_type,
            list(filter(None, args.model_name_or_path.split('/'))).pop(),
            str(args.max_seq_length),
            str(task)))
    if os.path.exists(cached_features_file):
        logger.info("Loading features from cached file %s", cached_features_file)
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", args.data_dir)
        label_list = processor.get_labels()
        if data_type == 'train':
            examples = processor.get_train_examples(args.data_dir)
        elif data_type == 'dev':
            examples = processor.get_dev_examples(args.data_dir)
        else:
            examples = processor.get_test_examples(args.data_dir)
        features = convert_examples_to_features(
            examples,
            tokenizer,
            label_list=label_list,
            max_length=args.max_seq_length,
            output_mode=output_mode,
            pad_on_left=bool(args.model_type in ['xlnet']),  # pad on the left for xlnet
            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
            pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0,
        )
        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s", cached_features_file)
            torch.save(features, cached_features_file)
    if args.local_rank == 0 and not evaluate:
        torch.distributed.barrier()
    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
    all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
    all_lens = torch.tensor([f.input_len for f in features], dtype=torch.long)
    if output_mode == "classification":
        all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
    elif output_mode == "regression":
        all_labels = torch.tensor([f.label for f in features], dtype=torch.float)
    dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids,
                            all_lens, all_labels)
    return dataset

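# Illustrative evaluation step (not from the original source): the variant
# above packs five tensors per example, so a batch unpacks as (input_ids,
# attention_mask, token_type_ids, input_len, labels). `model` and `device`
# are assumed to come from the surrounding training script.
def _example_eval_step(model, batch, device):
    model.eval()
    batch = tuple(t.to(device) for t in batch)
    inputs = {
        'input_ids': batch[0],
        'attention_mask': batch[1],
        'token_type_ids': batch[2],
        'labels': batch[4],  # batch[3] holds the raw input lengths
    }
    with torch.no_grad():
        return model(**inputs)
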
def for_server(args, text: str, task: str):
    # Set up the device: single GPU / CPU, or the assigned distributed rank.
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        args.n_gpu = 1
    args.device = device
    # Wrap the raw string into the line format the processor expects
    # (a header row followed by [guid, label, text]).
    text = [''] + [['0'] + ['0'] + [text]]
    processor = processors[task]()
    output_mode = output_modes[task]
    label_list = processor.get_labels()
    # num_labels = len(label_list)
    examples = processor._create_examples(text, 'predict')
    if args.local_rank in [-1, 0]:
        tokenizer = tokenization_albert.FullTokenizer(
            vocab_file=args.vocab_file,
            do_lower_case=args.do_lower_case,
        )
        # Pick the newest checkpoint, falling back to the output directory.
        checkpoints = [(0, args.output_dir)]
        if args.eval_all_checkpoints:
            checkpoints = list(
                os.path.dirname(c)
                for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
            checkpoints = [(int(checkpoint.split('-')[-1]), checkpoint)
                           for checkpoint in checkpoints
                           if checkpoint.find('checkpoint') != -1]
            checkpoints = sorted(checkpoints, key=lambda x: x[0])
            logger.info("Evaluate the following checkpoints: %s", checkpoints)
            if len(checkpoints) == 0:
                checkpoints = [(0, args.output_dir)]
            else:
                checkpoints = [checkpoints[-1]]
        for _, checkpoint in checkpoints:
            model = AlbertForSequenceClassification.from_pretrained(checkpoint)
            model.to(args.device)
            features = convert_examples_to_features(examples,
                                                    tokenizer,
                                                    label_list=label_list,
                                                    max_seq_length=args.max_seq_length,
                                                    output_mode=output_mode)
            all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
            all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
            all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
            all_lens = torch.tensor([f.input_len for f in features], dtype=torch.long)
            all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
            dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids,
                                    all_lens, all_labels)
            for step, batch in enumerate(dataset):
                model.eval()
                batch = tuple(t.to(args.device) for t in batch)
                with torch.no_grad():
                    # Each dataset item is a single example; add a batch dimension.
                    inputs = {
                        'input_ids': batch[0].unsqueeze(0),
                        'attention_mask': batch[1].unsqueeze(0),
                    }
                    inputs['token_type_ids'] = batch[2].unsqueeze(0)
                    outputs = model(**inputs)
                    logits = outputs[0]
                logits = logits.detach().cpu().numpy()
                preds = np.argmax(logits, axis=1)
                label = tasks_num_labels[task][preds[0]]
                return label

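# Hypothetical invocation (the task name and text below are placeholders, not
# values from the original source): for_server sets up the device, loads the
# latest checkpoint, and returns the predicted label for one input string.
#
#   label = for_server(args, text="...", task="tnews")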