# Shared imports assumed by the snippets below (transformers v2.x-era GLUE
# utilities; later releases removed the pad_on_left/pad_token arguments of
# glue_convert_examples_to_features). Snippet-specific names (FAQProcessor,
# PairProcessor, xm, fetch_20newsgroups, constants, data_utils, AttrDict, ...)
# are assumed to come from each snippet's own project.
import logging
import os

import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from tqdm import tqdm

from transformers import BertModel, BertTokenizer, InputExample
from transformers import glue_convert_examples_to_features as convert_examples_to_features
from transformers import glue_output_modes as output_modes
from transformers import glue_processors as processors

logger = logging.getLogger(__name__)


def load_and_cache_examples(args, processor, tokenizer, evaluate=False):
    if args.local_rank not in [-1, 0] and not evaluate:
        torch.distributed.barrier()  # Make sure only the first process in distributed training processes the dataset; the others will use the cache

    # Load data features from cache or dataset file
    cached_features_file = os.path.join(
        args.data_dir,
        'cached_{}_{}'.format('dev' if evaluate else 'train', str(args.max_seq_length)))
    if os.path.exists(cached_features_file):
        logger.info("Loading features from cached file %s", cached_features_file)
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", args.data_dir)
        label_list = processor.get_labels()
        examples = (processor.get_dev_examples(args.data_dir) if evaluate
                    else processor.get_train_examples(args.data_dir))
        features = convert_examples_to_features(
            examples=examples,
            tokenizer=tokenizer,
            max_length=args.max_seq_length,
            label_list=label_list,
            output_mode='classification',
            pad_on_left=False,
            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
            pad_token_segment_id=0)
        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s", cached_features_file)
            torch.save(features, cached_features_file)

    if args.local_rank == 0 and not evaluate:
        torch.distributed.barrier()  # The first process has built the cache; release the others

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
    all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
    all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
    dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels)
    return dataset

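def _example_train_loader(args, processor, tokenizer):
    """A minimal usage sketch for the function above. The batch size and the
    exact shape of `args` are assumptions, not part of the original snippet."""
    train_dataset = load_and_cache_examples(args, processor, tokenizer, evaluate=False)
    train_sampler = RandomSampler(train_dataset)
    loader = DataLoader(train_dataset, sampler=train_sampler, batch_size=32)
    # Each batch unpacks as (input_ids, attention_mask, token_type_ids, labels).
    return loader
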
def load_and_cache_examples(args, task, tokenizer, evaluate=False):
    processor = processors[task]()
    output_mode = output_modes[task]

    # Load data features from cache or dataset file
    if not os.path.exists("./dataset_cached"):
        os.makedirs("./dataset_cached")
    cached_features_file = os.path.join(
        "./dataset_cached",
        'cached_{}_{}_{}_{}'.format(
            'dev' if evaluate else 'train',
            list(filter(None, args.model_name_or_path.split('/'))).pop(),
            str(args.max_seq_length),
            str(task)))
    if os.path.exists(cached_features_file):
        logger.info("Loading features from cached file %s", cached_features_file)
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", args.data_dir)
        label_list = processor.get_labels()
        if task in ['mnli', 'mnli-mm'] and args.model_type in ['roberta']:
            # HACK(label indices are swapped in RoBERTa pretrained model)
            label_list[1], label_list[2] = label_list[2], label_list[1]
        examples = (processor.get_dev_examples(args.data_dir) if evaluate
                    else processor.get_train_examples(args.data_dir))
        features = convert_examples_to_features(
            examples,
            tokenizer,
            label_list=label_list,
            max_length=args.max_seq_length,
            output_mode=output_mode,
        )
        logger.info("Saving features into cached file %s", cached_features_file)
        torch.save(features, cached_features_file)

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
    # all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
    if output_mode == "classification":
        all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
    elif output_mode == "regression":
        all_labels = torch.tensor([f.label for f in features], dtype=torch.float)
    # dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels)
    dataset = TensorDataset(all_input_ids, all_attention_mask, all_labels)
    return dataset

def load_and_cache_examples(args, task, tokenizer, evaluate=False):
    if args.local_rank not in [-1, 0] and not evaluate:
        torch.distributed.barrier()  # Make sure only the first process in distributed training processes the dataset; the others will use the cache

    dataset = fetch_20newsgroups(subset='test' if evaluate else 'train',
                                 remove=('headers', 'footers', 'quotes'))
    data, target = dataset.data, dataset.target
    assert len(data) == len(target)

    examples = []
    for i, inst in enumerate(data):
        if i % 2000 == 0:
            print('{}/{}'.format(i, len(data)))
        examples.append(InputExample(guid='', text_a=inst, label=target[i]))

    features = convert_examples_to_features(
        examples,
        tokenizer,
        label_list=list(range(20)),  # 20 Newsgroups has 20 classes
        max_length=args.max_seq_length,
        output_mode="classification",
        pad_on_left=False,
        pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
        pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0,
    )

    if args.local_rank == 0 and not evaluate:
        torch.distributed.barrier()  # The first process is done; release the others

    # Convert to Tensors and build dataset (this task is always classification)
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
    all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
    all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
    dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels)
    return dataset

def load_pred_examples(args, data_dir, task, tokenizer, evaluate=True):
    processor = processors[task]()
    output_mode = output_modes[task]

    # Load data features from cache or dataset file
    cached_features_file = os.path.join(
        data_dir,
        'cached_{}_{}_{}_{}'.format(
            'dev' if evaluate else 'train',
            list(filter(None, args.model_name_or_path.split('/'))).pop(),
            str(args.max_seq_length),
            str(task)))
    if os.path.exists(cached_features_file) and not args.overwrite_cache:
        logger.info("Loading features from cached file %s", cached_features_file)
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", data_dir)
        label_list = processor.get_labels()
        if task in ['mnli', 'mnli-mm'] and args.model_type in ['roberta']:
            # HACK(label indices are swapped in RoBERTa pretrained model)
            label_list[1], label_list[2] = label_list[2], label_list[1]
        examples = processor.get_dev_examples(data_dir)
        features = convert_examples_to_features(
            examples,
            tokenizer,
            label_list=label_list,
            max_length=args.max_seq_length,
            output_mode=output_mode,
            # pad_on_left=bool(args.model_type in ['xlnet']),  # pad on the left for xlnet
            # pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
            # pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0,
        )
        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s", cached_features_file)
            torch.save(features, cached_features_file)

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
    # all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
    if output_mode == "classification":
        all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
    elif output_mode == "regression":
        all_labels = torch.tensor([f.label for f in features], dtype=torch.float)
    dataset = TensorDataset(all_input_ids, all_attention_mask, all_labels)
    return dataset

def load_and_cache_examples(task, tokenizer, data_dir, model_name_or_path, max_seq_length,
                            model_type, evaluate=False, overwrite_cache=False):
    processor = processors[task]()
    output_mode = output_modes[task]

    # Load data features from cache or dataset file
    cached_features_file = os.path.join(
        data_dir,
        "cached_{}_{}_{}_{}".format(
            "dev" if evaluate else "train",
            list(filter(None, model_name_or_path.split("/"))).pop(),
            str(max_seq_length),
            str(task),
        ),
    )
    if os.path.exists(cached_features_file) and not overwrite_cache:
        logger.info("Loading features from cached file %s", cached_features_file)
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", data_dir)
        label_list = processor.get_labels()
        if task in ["mnli", "mnli-mm"] and model_type in ["roberta", "xlmroberta"]:
            # HACK(label indices are swapped in RoBERTa pretrained model)
            label_list[1], label_list[2] = label_list[2], label_list[1]
        examples = (
            processor.get_dev_examples(data_dir) if evaluate
            else processor.get_train_examples(data_dir)
        )
        features = convert_examples_to_features(
            examples,
            tokenizer,
            label_list=label_list,
            max_length=max_seq_length,
            output_mode=output_mode,
            pad_on_left=bool(model_type in ["xlnet"]),  # pad on the left for xlnet
            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
            pad_token_segment_id=4 if model_type in ["xlnet"] else 0,
        )
        logger.info("Saving features into cached file %s", cached_features_file)
        torch.save(features, cached_features_file)

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
    all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
    if output_mode == "classification":
        all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
    elif output_mode == "regression":
        all_labels = torch.tensor([f.label for f in features], dtype=torch.float)
    return TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels)

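def _example_build_dev_dataset(tokenizer):
    """Usage sketch for the fully parameterized variant above. The task name,
    data path, and model id are illustrative assumptions, not values from the
    original code."""
    return load_and_cache_examples(
        task="mrpc",
        tokenizer=tokenizer,
        data_dir="./glue_data/MRPC",
        model_name_or_path="bert-base-uncased",
        max_seq_length=128,
        model_type="bert",
        evaluate=True,
    )
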
def load_and_cache_examples(args, task, tokenizer, evaluate=False):
    if args.local_rank not in [-1, 0] and not evaluate:
        torch.distributed.barrier()  # Make sure only the first process in distributed training processes the dataset; the others will use the cache

    processor = processors[task]()
    output_mode = output_modes[task]

    # Load data features from cache or dataset file
    cached_features_file = os.path.join(
        args.data_dir,
        'cached_{}_{}_{}_{}_{}'.format(
            'dev' if evaluate else 'train',
            list(filter(None, args.model_name_or_path.split('/'))).pop(),
            str(args.max_seq_length),
            str(task),
            str(args.num_train_examples)))
    if os.path.exists(cached_features_file) and not args.overwrite_cache:
        logger.info("Loading features from cached file %s", cached_features_file)
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", args.data_dir)
        label_list = processor.get_labels()
        examples = (processor.get_dev_examples(args.data_dir) if evaluate
                    else processor.get_train_examples(args.data_dir, args.num_train_examples))
        features = convert_examples_to_features(
            examples,
            tokenizer,
            label_list=label_list,
            max_length=args.max_seq_length,
            output_mode=output_mode,
        )
        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s", cached_features_file)
            torch.save(features, cached_features_file)

    if args.local_rank == 0 and not evaluate:
        torch.distributed.barrier()  # The first process has built the cache; release the others

    # Convert to Tensors and build dataset. RoBERTa does not use token type ids,
    # so they are only materialized for the other model types.
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
    if args.model_type != "roberta":
        all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
    if output_mode == "classification":
        all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
    elif output_mode == "regression":
        all_labels = torch.tensor([f.label for f in features], dtype=torch.float)
    if args.model_type == "roberta":
        dataset = TensorDataset(all_input_ids, all_attention_mask, all_labels)
    else:
        dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels)
    return dataset

def load_and_cache_examples(args, task, tokenizer, evaluate=False):
    if args.local_rank not in [-1, 0] and not evaluate:
        torch.distributed.barrier()  # Make sure only the first process in distributed training processes the dataset; the others will use the cache

    processor = processors[task]()
    output_mode = output_modes[task]

    # Load data features from cache or dataset file
    cached_features_file = os.path.join(
        args.train,
        'cached_{}_{}_{}_{}'.format(
            'dev' if evaluate else 'train',
            list(filter(None, args.model_name.split('/'))).pop(),
            str(args.max_seq_length),
            str(task)))
    if os.path.exists(cached_features_file) and not args.overwrite_cache:
        logger.info("Loading features from cached file %s", cached_features_file)
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", args.train)
        label_list = processor.get_labels()
        if task in ['mnli', 'mnli-mm'] and args.model_type in ['roberta']:
            # HACK(label indices are swapped in RoBERTa pretrained model)
            label_list[1], label_list[2] = label_list[2], label_list[1]
        examples = (processor.get_dev_examples(args.validation) if evaluate
                    else processor.get_train_examples(args.train))
        features = convert_examples_to_features(
            examples,
            tokenizer,
            label_list=label_list,
            max_length=args.max_seq_length,
            output_mode=output_mode,
            pad_on_left=bool(args.model_type in ['xlnet']),  # pad on the left for xlnet
            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
            pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0,
        )
        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s", cached_features_file)
            torch.save(features, cached_features_file)

    if args.local_rank == 0 and not evaluate:
        torch.distributed.barrier()  # The first process has built the cache; release the others

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
    all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
    if output_mode == "classification":
        all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
    elif output_mode == "regression":
        all_labels = torch.tensor([f.label for f in features], dtype=torch.float)
    dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels)
    return dataset

def load_train_examples(args, task, tokenizer, processor):
    # Training features are deliberately not cached so that the positive and
    # negative examples differ from epoch to epoch.
    datas = processor.get_train()

    logger.info("Creating features from dataset file at %s", args.data_dir)
    features = []
    for data in datas:
        feature = convert_examples_to_features(
            data,
            tokenizer,
            label_list=[1],
            output_mode="classification",
            max_length=args.max_seq_length,
            pad_on_left=bool(args.model_type in ["xlnet"]),  # pad on the left for xlnet
            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
            pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0,
        )
        features.append(feature)

    # Convert to Tensors and build dataset:
    # features[0] = original, features[1] = positive, features[2] = negative.
    original_input_ids = torch.tensor([f.input_ids for f in features[0]], dtype=torch.long)
    original_attention_mask = torch.tensor([f.attention_mask for f in features[0]], dtype=torch.long)
    original_token_type_ids = torch.tensor([f.token_type_ids for f in features[0]], dtype=torch.long)
    pos_input_ids = torch.tensor([f.input_ids for f in features[1]], dtype=torch.long)
    pos_attention_mask = torch.tensor([f.attention_mask for f in features[1]], dtype=torch.long)
    pos_token_type_ids = torch.tensor([f.token_type_ids for f in features[1]], dtype=torch.long)
    neg_input_ids = torch.tensor([f.input_ids for f in features[2]], dtype=torch.long)
    neg_attention_mask = torch.tensor([f.attention_mask for f in features[2]], dtype=torch.long)
    neg_token_type_ids = torch.tensor([f.token_type_ids for f in features[2]], dtype=torch.long)
    dataset = TensorDataset(original_input_ids, original_attention_mask, original_token_type_ids,
                            pos_input_ids, pos_attention_mask, pos_token_type_ids,
                            neg_input_ids, neg_attention_mask, neg_token_type_ids)
    return dataset

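def _example_triplet_batches(dataset, batch_size=8):
    """Sketch of consuming the nine-tensor dataset returned above: each batch
    unpacks into (original, positive, negative) encoder inputs, e.g. for a
    triplet-style ranking loss. The batch size and loss setup are assumptions."""
    loader = DataLoader(dataset, sampler=RandomSampler(dataset), batch_size=batch_size)
    for batch in loader:
        (orig_ids, orig_mask, orig_segs,
         pos_ids, pos_mask, pos_segs,
         neg_ids, neg_mask, neg_segs) = batch
        # Feed each triple through the encoder and score original/positive
        # against original/negative here.
        break
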
def load_and_cache_examples(args, task, tokenizer, evaluate=False):
    processor = glue_processors[task]()
    output_mode = glue_output_modes[task]

    # Load data features from cache or dataset file
    cached_features_file = os.path.join(
        args.data_dir,
        "cached_{}_{}_{}_{}".format(
            "dev" if evaluate else "train",
            args.model_type,
            str(args.max_seq_length),
            str(task),
        ),
    )
    if os.path.exists(cached_features_file):
        logger.info("Loading features from cached file %s", cached_features_file)
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", args.data_dir)
        label_list = processor.get_labels()
        if task in ["mnli", "mnli-mm"] and args.model_type in ["roberta", "xlmroberta"]:
            # HACK(label indices are swapped in RoBERTa pretrained model)
            label_list[1], label_list[2] = label_list[2], label_list[1]
        examples = (
            processor.get_dev_examples(os.path.join(args.data_dir, task))
            if evaluate
            else processor.get_train_examples(os.path.join(args.data_dir, task))
        )
        features = convert_examples_to_features(
            examples,
            tokenizer,
            label_list=label_list,
            max_length=args.max_seq_length,
            output_mode=output_mode,
            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
            pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0,
        )
        logger.info("Saving features into cached file %s", cached_features_file)
        torch.save(features, cached_features_file)

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
    all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
    if output_mode == "classification":
        all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
    elif output_mode == "regression":
        all_labels = torch.tensor([f.label for f in features], dtype=torch.float)
    dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels)
    return dataset

def get_bert_embedding(examples):
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertModel.from_pretrained('bert-base-uncased', return_dict=True)

    label_list = ['positive', 'negative']
    features1 = convert_examples_to_features(
        examples,
        tokenizer,
        max_length=256,
        label_list=label_list,
        output_mode='classification',
    )
    all_input_ids1 = torch.tensor([f.input_ids for f in features1], dtype=torch.long)
    all_attention_mask1 = torch.tensor([f.attention_mask for f in features1], dtype=torch.long)
    all_token_type_ids1 = torch.tensor([f.token_type_ids for f in features1], dtype=torch.long)
    all_labels1 = torch.tensor([f.label for f in features1], dtype=torch.long)
    books_dataset = TensorDataset(all_input_ids1, all_attention_mask1,
                                  all_token_type_ids1, all_labels1)

    train_sampler = SequentialSampler(books_dataset)
    train_dataloader = DataLoader(books_dataset, sampler=train_sampler, batch_size=16)
    epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=False)

    model.to(device)  # `device` is assumed to be defined at module level
    book = []
    label = []
    for step, batch in enumerate(epoch_iterator):
        batch = tuple(t.to(device) for t in batch)
        label.append(batch[3])
        inputs = {
            "input_ids": batch[0],
            "attention_mask": batch[1],
            "token_type_ids": batch[2],
        }
        outputs = model(**inputs)
        book.append(outputs.pooler_output.detach().cpu())
    return book, label

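def _example_pooled_embeddings():
    """Usage sketch for get_bert_embedding above; the texts are made up, and
    the labels must come from its internal label_list ('positive'/'negative')."""
    examples = [
        InputExample(guid="ex-0", text_a="a gripping, well-paced novel",
                     text_b=None, label="positive"),
        InputExample(guid="ex-1", text_a="a dull and repetitive read",
                     text_b=None, label="negative"),
    ]
    book, label = get_bert_embedding(examples)
    pooled = torch.cat(book, dim=0)   # (num_examples, hidden_size), on CPU
    labels = torch.cat(label, dim=0)  # (num_examples,)
    return pooled, labels
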
def load_and_cache_examples(args, task, tokenizer, evaluate=False):
    if not xm.is_master_ordinal():
        xm.rendezvous("load_and_cache_examples")  # non-master TPU cores wait while the master builds the cache

    processor = processors[task]()
    output_mode = output_modes[task]
    cached_features_file = os.path.join(
        args.cache_dir,
        "cached_{}_{}_{}_{}".format(
            "dev" if evaluate else "train",
            list(filter(None, args.model_name_or_path.split("/"))).pop(),
            str(args.max_seq_length),
            str(task),
        ),
    )

    # Load data features from cache or dataset file
    if os.path.exists(cached_features_file) and not args.overwrite_cache:
        logger.info("Loading features from cached file %s", cached_features_file)
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", args.data_dir)
        label_list = processor.get_labels()
        if task in ["mnli", "mnli-mm"] and args.model_type in ["roberta"]:
            # HACK(label indices are swapped in RoBERTa pretrained model)
            label_list[1], label_list[2] = label_list[2], label_list[1]
        examples = (
            processor.get_dev_examples(args.data_dir) if evaluate
            else processor.get_train_examples(args.data_dir)
        )
        features = convert_examples_to_features(
            examples,
            tokenizer,
            max_length=args.max_seq_length,
            label_list=label_list,
            output_mode=output_mode,
        )
        logger.info("Saving features into cached file %s", cached_features_file)
        torch.save(features, cached_features_file)

    if xm.is_master_ordinal():
        xm.rendezvous("load_and_cache_examples")  # the master is done; release the waiting cores

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
    all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
    if output_mode == "classification":
        all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
    elif output_mode == "regression":
        all_labels = torch.tensor([f.label for f in features], dtype=torch.float)
    dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels)
    return dataset

def load_and_cache_examples(base_data_dir: str, config, model_type, max_seq_length, evaluate=False):
    model_type = model_type.lower()
    task = config["task"].lower()
    data_dir = f"{base_data_dir}/{task.upper()}"

    config_class, model_class, tokenizer_class = constants.MODEL_CLASSES[model_type]
    tokenizer = tokenizer_class.from_pretrained(config["model_name_or_path"],
                                                do_lower_case=True,
                                                cache_dir=None)
    processor = processors[task]()
    output_mode = output_modes[task]

    print(f"Creating features from dataset file at {data_dir}")
    label_list = processor.get_labels()
    if task in ["mnli", "mnli-mm"] and model_type in ["roberta"]:
        # HACK(label indices are swapped in RoBERTa pretrained model)
        label_list[1], label_list[2] = label_list[2], label_list[1]
    examples = (processor.get_dev_examples(data_dir) if evaluate
                else processor.get_train_examples(data_dir))
    features = convert_examples_to_features(
        examples,
        tokenizer,
        label_list=label_list,
        max_length=max_seq_length,
        output_mode=output_mode,
        pad_on_left=bool(model_type in ["xlnet"]),
        pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
        pad_token_segment_id=4 if model_type in ["xlnet"] else 0,
    )

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
    all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
    all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
    dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels)
    return dataset

def load_and_cache_examples(args, task, tokenizer, file_path):
    processor = processors[task]()
    output_mode = output_modes[task]

    # Feature caching is disabled in this variant; features are rebuilt from
    # file_path on every call.
    logger.info("Loading from dataset file at %s", file_path)
    label_list = processor.get_labels()
    examples = processor.get_examples(file_path)

    logger.info("Encoding features from dataset file at %s", file_path)
    features = convert_examples_to_features(
        examples,
        tokenizer,
        label_list=label_list,
        max_length=args.max_seq_length,
        output_mode=output_mode,
    )

    # Convert to Tensors and build dataset (RoBERTa does not use token type ids)
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
    if args.model_type != "roberta":
        all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
    all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
    if args.model_type == "roberta":
        dataset = TensorDataset(all_input_ids, all_attention_mask, all_labels)
    else:
        dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels)
    return dataset

def load_and_cache_examples(args, task, tokenizer, evaluate=False):
    if args.local_rank not in [-1, 0] and not evaluate:
        torch.distributed.barrier()  # Make sure only the first process in distributed training processes the dataset; the others will use the cache

    processor = processors[task](language=args.language, train_language=args.train_language)
    output_mode = output_modes[task]

    # Load data features from cache or dataset file
    cached_features_file = os.path.join(
        args.data_dir,
        'cached_{}_{}_{}_{}_{}'.format(
            'example_new' if evaluate else 'train',
            list(filter(None, args.model_name_or_path.split('/'))).pop(),
            str(args.max_seq_length),
            str(task),
            str(args.train_language if (not evaluate and args.train_language is not None)
                else args.language)))
    if os.path.exists(cached_features_file) and not args.overwrite_cache:
        logger.info("Loading features from cached file %s", cached_features_file)
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", args.data_dir)
        label_list = processor.get_labels()
        examples = processor.get_examples(args.data_dir)
        features = convert_examples_to_features(
            examples,
            tokenizer,
            label_list=label_list,
            max_length=args.max_seq_length,
            output_mode=output_mode,
            pad_on_left=False,
            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
            pad_token_segment_id=0,
        )
        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s", cached_features_file)
            torch.save(features, cached_features_file)

    if args.local_rank == 0 and not evaluate:
        torch.distributed.barrier()  # The first process has built the cache; release the others

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
    all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
    if output_mode == "classification":
        all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
    else:
        raise ValueError('No other `output_mode` for XNLI.')
    dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels)
    return dataset

def load_and_cache_examples(args, tokenizer, evaluate=False):
    if args.local_rank not in [-1, 0] and not evaluate:
        torch.distributed.barrier()  # Make sure only the first process in distributed training processes the dataset; the others will use the cache

    processor = PodcastProcessor(query_field=args.query_field)
    output_mode = "classification"

    # Load data features from cache or dataset file
    cached_features_file = os.path.join(
        args.output_dir,
        "cached_{}_{}_{}_{}".format(
            "test" if evaluate else "train",
            list(filter(None, args.model_name_or_path.split("/"))).pop(),
            str(args.max_seq_length),
            str(args.query_field)
        )
    )
    if os.path.exists(cached_features_file) and not args.overwrite_cache:
        logger.info("Loading features from cached file %s", cached_features_file)
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", args.data_dir)
        label_list = processor.get_labels()
        examples = (
            processor.get_test_examples(args.train_test_file, args.query_file, args.qrel_file)
            if evaluate
            else processor.get_train_examples(args.train_test_file, args.query_file, args.qrel_file)
        )
        features = convert_examples_to_features(
            examples,
            tokenizer,
            max_length=args.max_seq_length,
            label_list=label_list,
            output_mode=output_mode,
        )
        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s", cached_features_file)
            torch.save(features, cached_features_file)

    if args.local_rank == 0 and not evaluate:
        torch.distributed.barrier()  # The first process has built the cache; release the others

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
    if output_mode == "classification":
        all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
    else:
        raise ValueError("Only classification is supported by this processor.")
    dataset = TensorDataset(all_input_ids, all_attention_mask, all_labels)
    return dataset

def load_and_cache_examples(args, tokenizer, evaluate=False):
    processor = FAQProcessor()
    cached_features_file = "cached_{}_bert".format("dev" if evaluate else 'train')
    if os.path.exists(cached_features_file):
        features = torch.load(cached_features_file)
    else:
        label_list = processor.get_labels()
        examples = (processor.get_dev_examples(args.data_dir) if evaluate
                    else processor.get_train_examples(args.data_dir))
        features = convert_examples_to_features(
            examples=examples,
            tokenizer=tokenizer,
            max_length=args.max_seq_length,
            label_list=label_list,
            output_mode='classification',
            pad_on_left=False,
            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
            pad_token_segment_id=0)
        logger.info('Saving features into cached file %s', cached_features_file)
        torch.save(features, cached_features_file)

    # For reference, the fields of the converted records:
    #   InputExample:  guid, text_a, text_b, label
    #   InputFeatures: input_ids, attention_mask, token_type_ids, label

    # Convert to tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
    all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
    all_label = torch.tensor([f.label for f in features], dtype=torch.long)
    dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_label)
    return dataset

def get_dataset(self, inputs):
    examples = self.processor.get_dev_examples(inputs)
    features = convert_examples_to_features(
        examples,
        self.tokenizer,
        label_list=self.label_list,
        max_length=self.max_seq_length,
        output_mode=self.output_mode,
        pad_on_left=False,
        pad_token=self.tokenizer.convert_tokens_to_ids([self.tokenizer.pad_token])[0],
        pad_token_segment_id=0,
    )
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
    all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
    all_labels = torch.tensor(
        [f.label for f in features],
        dtype=(torch.long if self.output_mode == 'classification' else torch.float))
    dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels)
    return dataset

def load_examples(args, task, tokenizer):
    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier()  # Make sure only the first process in distributed training processes the dataset

    processor = FAQProcessor()

    logger.info("Creating features from dataset file at %s", args.data_dir)
    examples = processor.get_candidates(args.data_dir)
    features = convert_examples_to_features(
        examples,
        tokenizer,
        label_list=[1],
        output_mode="classification",
        max_length=args.max_seq_length,
        pad_on_left=bool(args.model_type in ["xlnet"]),  # pad on the left for xlnet
        pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
        pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0,
    )

    if args.local_rank == 0:
        torch.distributed.barrier()  # The first process is done; release the others

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
    all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
    dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids)
    return dataset, processor.candidate_title, processor.candidate_reply

def load_and_cache_examples(args, task, tokenizer, evaluate=False):
    processor = processors[task]()
    output_mode = output_modes[task]

    logger.info("Creating features from dataset file at %s", args.data_dir)
    label_list = processor.get_labels()
    if task in ['mnli', 'mnli-mm'] and args.model_type in ['roberta']:
        label_list[1], label_list[2] = label_list[2], label_list[1]
    examples = (processor.get_dev_examples(args.data_dir) if evaluate
                else processor.get_train_examples(args.data_dir))
    if not evaluate and args.data_aug:
        examples_aug = processor.get_train_examples_aug(args.data_dir)
        examples = examples + examples_aug
    features = convert_examples_to_features(
        examples,
        tokenizer,
        label_list=label_list,
        max_length=args.max_seq_length,
        output_mode=output_mode,
        pad_on_left=bool(args.model_type in ['xlnet']),  # pad on the left for xlnet
        pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
        pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0,
    )

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
    all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
    if output_mode == "classification":
        all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
    elif output_mode == "regression":
        all_labels = torch.tensor([f.label for f in features], dtype=torch.float)
    dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels)
    return dataset

def _make_data_loader(examples, label_list, tokenizer, batch_size, max_length, shuffle):
    features = convert_examples_to_features(examples,
                                            tokenizer,
                                            label_list=label_list,
                                            max_length=max_length,
                                            output_mode="classification")
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
    all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
    all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
    dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels)
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

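def _example_loader_shapes(examples, tokenizer):
    """Usage sketch for _make_data_loader above; the binary label list, batch
    size, and max length are assumptions."""
    loader = _make_data_loader(examples, label_list=["0", "1"], tokenizer=tokenizer,
                               batch_size=16, max_length=128, shuffle=True)
    input_ids, attention_mask, token_type_ids, labels = next(iter(loader))
    # input_ids / attention_mask / token_type_ids: (batch, 128); labels: (batch,)
    return loader
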
def load_and_cache_examples(args, tokenizer, evaluate=False):
    processor = PairProcessor(args.window_size)
    output_mode = 'classification'
    cached_features_file = os.path.join(
        args.data_dir,
        'cached_{}_{}_{}_{}'.format('dev' if evaluate else 'train', 'bert',
                                    str(args.max_seq_length), 'pair_order'))
    if os.path.exists(cached_features_file):
        logger.info("Loading features from cached file %s", cached_features_file)
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", args.data_dir)
        label_list = processor.get_labels()
        examples = (processor.get_dev_examples(args.data_dir) if evaluate
                    else processor.get_train_examples(args.data_dir))
        features = convert_examples_to_features(
            examples,
            tokenizer,
            label_list=label_list,
            max_length=args.max_seq_length,
            output_mode=output_mode,
            pad_on_left=False,
            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
            pad_token_segment_id=0,
        )
        logger.info("Saving features into cached file %s", cached_features_file)
        torch.save(features, cached_features_file)

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
    all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
    all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
    dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels)
    return dataset

def load_and_cache_examples(args, task, tokenizer, evaluate=False):
    processor = data_utils.ImdbProcessor()
    output_mode = 'classification'

    # Load data features from cache or dataset file
    cached_features_file = os.path.join(
        args.data_dir,
        "cached_{}_{}_{}_{}".format(
            "dev" if evaluate else "train",
            list(filter(None, args.model_name_or_path.split("/"))).pop(),
            str(args.max_seq_length),
            str(task),
        ),
    )
    if os.path.exists(cached_features_file) and not args.overwrite_cache:
        features = torch.load(cached_features_file)
    else:
        label_list = processor.get_labels()
        # Use only the first 10 examples
        examples = processor.get_dev_examples(args.data_dir)[:10]
        features = convert_examples_to_features(
            examples,
            tokenizer,
            max_length=args.max_seq_length,
            label_list=label_list,
            output_mode=output_mode,
        )
        if args.local_rank in [-1, 0]:
            torch.save(features, cached_features_file)

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
    all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
    all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
    dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels)
    return dataset

def load_data(data_dir, processor, tokenizer, evaluate=False):
    examples = (processor.get_dev_examples(data_dir) if evaluate
                else processor.get_train_examples(data_dir))
    features = convert_examples_to_features(examples,
                                            tokenizer,
                                            max_length=128,
                                            label_list=processor.get_labels(),
                                            output_mode='classification')
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
    all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
    all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
    dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels)
    return dataset

def load_and_cache_examples(args, task, tokenizer, evaluate=False):
    processor = processors[task]()
    output_mode = output_modes[task]

    # Load data features from cache or dataset file
    cached_features_file = os.path.join(
        args.data_dir,
        'cached_{}_{}_{}_{}'.format(
            'dev' if evaluate else 'train',
            list(filter(None, args.model_name_or_path.split('/'))).pop(),
            str(args.max_seq_length),
            str(task)))
    if os.path.exists(cached_features_file):
        logger.info("Loading features from cached file %s", cached_features_file)
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", args.data_dir)
        label_list = processor.get_labels()
        examples = (processor.get_dev_examples(args.data_dir) if evaluate
                    else processor.get_train_examples(args.data_dir))
        features = convert_examples_to_features(
            examples,
            tokenizer,
            label_list=label_list,
            max_length=args.max_seq_length,
            output_mode=output_mode,
            pad_on_left=bool(args.model_type in ['xlnet']),  # pad on the left for xlnet
            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
            pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0,
        )
        logger.info("Saving features into cached file %s", cached_features_file)
        torch.save(features, cached_features_file)

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
    all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
    if output_mode == "classification":
        all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
    elif output_mode == "regression":
        all_labels = torch.tensor([f.label for f in features], dtype=torch.float)
    dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels)
    return dataset

def load_and_cache_examples(tokenizer, mode):
    processor = GLUECoSNLIProcessor(language='en', train_language=None)
    output_mode = output_modes["xnli"]

    print(f"Creating features from dataset file at {DATA_PATH}")
    label_list = processor.get_labels()
    if mode == "train":
        examples = processor.get_train_examples(DATA_PATH)
    elif mode == "valid":
        examples = processor.get_valid_examples(DATA_PATH)
    elif mode == "test":
        examples = processor.get_test_examples(DATA_PATH)
        TEST_SZ = len(examples)
        # Pad the test set so it splits evenly across the available GPUs
        # (guard against a zero device count on CPU-only machines).
        n_devices = max(1, torch.cuda.device_count())
        while len(examples) % n_devices != 0:
            examples.append(examples[0])
    else:
        raise ValueError(f"Unknown mode: {mode}")

    features = convert_examples_to_features(
        examples,
        tokenizer,
        max_length=MAX_SEQ_LEN,
        label_list=label_list,
        output_mode=output_mode,
    )

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
    # all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
    if output_mode == "classification":
        all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
    else:
        raise ValueError("No other `output_mode` for XNLI.")
    # dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels)
    dataset = TensorDataset(all_input_ids, all_attention_mask, all_labels)
    return dataset

def load_examples(args, examples, tokenizer, evaluate=False):
    if args.local_rank not in [-1, 0] and not evaluate:
        torch.distributed.barrier()  # Make sure only the first process in distributed training processes the dataset; the others will use the cache

    processor = processors[args.task_name]()
    output_mode = output_modes[args.task_name]
    label_list = processor.get_labels()
    features = convert_examples_to_features(
        examples,
        tokenizer,
        label_list=label_list,
        max_length=args.max_seq_length,
        output_mode=output_mode,
        pad_on_left=bool(args.model_type in ['xlnet']),  # pad on the left for xlnet
        pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
        pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0,
    )

    if args.local_rank == 0 and not evaluate:
        torch.distributed.barrier()  # The first process is done; release the others

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
    all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
    if output_mode == "classification":
        all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
    elif output_mode == "regression":
        all_labels = torch.tensor([f.label for f in features], dtype=torch.float)
    dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels)
    return dataset

def from_examples2dataset(args, examples, tokenizer):
    features = convert_examples_to_features(
        examples,
        tokenizer,
        label_list=[1],
        output_mode="classification",
        max_length=args.max_seq_length,
        pad_on_left=bool(args.model_type in ["xlnet"]),  # pad on the left for xlnet
        pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
        pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0,
    )
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_attention_masks = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
    all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
    dataset = TensorDataset(all_input_ids, all_attention_masks, all_token_type_ids)
    return dataset

def _load_examples(self, tokenizer, evaluate):
    if evaluate:
        examples = self.processor.get_dev_examples(self.params["data_dir"])
    else:
        examples = self.processor.get_train_examples(self.params["data_dir"])
    if self.params["n"] >= 0:
        examples = sample(examples, self.params["n"])
    features = convert_examples_to_features(
        examples,
        tokenizer,
        label_list=self.label_list,
        max_length=self.params["max_seq_length"],
        output_mode=self.output_mode,
        pad_on_left=False,
        pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
        pad_token_segment_id=0,
    )
    return self._convert_to_tensors(features)

def get_features(text, tokenizer, label_list=['0', '1']):
    '''Build a single-example DataLoader for inference.

    Params:
        text: the input sentence to encode
        tokenizer: a pretrained tokenizer
        label_list: list of labels as strings, default ['0', '1'] for sentiment
    '''
    ex = AttrDict()
    ex.update({
        "guid": "train-1",
        "label": "0",  # placeholder label; ignored at inference time
        "text_a": text,
        "text_b": None
    })
    examples = [ex]
    features = convert_examples_to_features(
        examples,
        tokenizer,
        label_list=label_list,
        max_length=128,
        output_mode="classification",
        pad_on_left=False,
        pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
        pad_token_segment_id=0,
    )

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
    all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
    all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
    eval_dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels)
    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=1)
    return eval_dataloader

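def _example_score_sentence(model, tokenizer):
    """Usage sketch for get_features above: score one sentence with a fine-tuned
    sequence-classification model. Where `model` and `tokenizer` come from is
    assumed, as is the example sentence."""
    loader = get_features("the movie was wonderful", tokenizer)
    model.eval()
    with torch.no_grad():
        for input_ids, attention_mask, token_type_ids, _ in loader:
            logits = model(input_ids=input_ids,
                           attention_mask=attention_mask,
                           token_type_ids=token_type_ids)[0]
            return int(logits.argmax(dim=-1).item())
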
def load_test_dataset(args, tokenizer):
    processor = processors[args.task_name]()
    label_list = processor.get_labels()
    examples = processor.get_test_examples(args.data_dir)
    features = convert_examples_to_features(
        examples,
        tokenizer,
        label_list=label_list,
        max_length=args.max_seq_length,
        output_mode=output_modes[args.task_name],
        pad_on_left=False,
        pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
        pad_token_segment_id=0
    )
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
    all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
    all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
    dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels)
    return dataset