def load_and_cache_examples(args, task, tokenizer, evaluate=False, TEST=False):
    if args.local_rank not in [-1, 0] and not evaluate:
        # Make sure only the first process in distributed training processes
        # the dataset; the others will use the cache.
        torch.distributed.barrier()

    processor = processors[task]()
    output_mode = output_modes[task]
    # Load data features from cache or dataset file
    if TEST:
        cached_features_file = os.path.join(args.data_dir, 'cached_{}_{}_{}_{}'.format(
            'test',
            list(filter(None, args.model_name_or_path.split('/'))).pop(),
            str(args.max_seq_length),
            str(task)))
    else:
        cached_features_file = os.path.join(args.data_dir, 'cached_{}_{}_{}_{}'.format(
            'dev' if evaluate else 'train',
            list(filter(None, args.model_name_or_path.split('/'))).pop(),
            str(args.max_seq_length),
            str(task)))
    if os.path.exists(cached_features_file):
        logger.info("Loading features from cached file %s", cached_features_file)
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", args.data_dir)
        label_list = processor.get_labels()
        if task in ['mnli', 'mnli-mm'] and args.model_type in ['roberta']:
            # HACK: label indices are swapped in the RoBERTa pretrained model
            label_list[1], label_list[2] = label_list[2], label_list[1]
        if TEST:
            examples = processor.get_test_examples(args.data_dir)
        else:
            examples = processor.get_dev_examples(args.data_dir) if evaluate \
                else processor.get_train_examples(args.data_dir)
        features = convert_examples_to_features(
            examples, label_list, args.max_seq_length, tokenizer, output_mode,
            cls_token_at_end=bool(args.model_type in ['xlnet']),  # XLNet has the CLS token at the end
            cls_token=tokenizer.cls_token,
            cls_token_segment_id=2 if args.model_type in ['xlnet'] else 0,
            sep_token=tokenizer.sep_token,
            # RoBERTa uses an extra separator between sentence pairs,
            # cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
            sep_token_extra=bool(args.model_type in ['roberta']),
            pad_on_left=bool(args.model_type in ['xlnet']),  # pad on the left for XLNet
            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
            pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0,
        )
        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s", cached_features_file)
            torch.save(features, cached_features_file)

    if args.local_rank == 0 and not evaluate:
        # The first process is done; the other processes can now load the cache.
        torch.distributed.barrier()

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
    if output_mode == "classification":
        all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)
    elif output_mode == "regression":
        all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.float)

    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
    return dataset
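# Usage sketch (an assumption, not part of the original code): the
# TensorDataset returned by load_and_cache_examples is typically wrapped in a
# DataLoader before training. The task name 'cola' and the attribute
# args.train_batch_size are hypothetical placeholders.
def example_build_train_dataloader(args, tokenizer):
    from torch.utils.data import DataLoader, RandomSampler
    train_dataset = load_and_cache_examples(args, 'cola', tokenizer, evaluate=False)
    # Batches unpack in the order the tensors were passed to TensorDataset:
    # input_ids, input_mask, segment_ids, label_ids.
    return DataLoader(train_dataset, sampler=RandomSampler(train_dataset),
                      batch_size=args.train_batch_size)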
def get_training_dataset(self, train_batch_size, dataset_folder="/home/phillab/glue_data/CoLA"):
    cola_proc = utils_glue.ColaProcessor()
    label_list = cola_proc.get_labels()
    max_seq_length = 400
    examples = cola_proc.get_train_examples(dataset_folder)
    features = utils_glue.convert_examples_to_features(
        examples, label_list, max_seq_length, self.tokenizer, "classification")

    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
    all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)

    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
    dataloader = DataLoader(dataset=dataset, batch_size=train_batch_size,
                            sampler=RandomSampler(dataset), drop_last=True)
    return dataloader
def load_examples(contents, max_seq_length, tokenizer, label_list):
    """
    :param contents: e.g. [('苹果很好用', '苹果')]
    :param max_seq_length:
    :param tokenizer: an initialized tokenizer
    :param label_list:
    :return:
    """
    examples = []
    for guid, content in enumerate(contents):
        sentence, aspect = content
        examples.append(InputExample(guid=guid, text_a=sentence, text_b=aspect))
    features = convert_examples_to_features(examples, label_list, max_seq_length,
                                            tokenizer, output_mode="classification",
                                            cls_token_segment_id=0,
                                            pad_token_segment_id=0)
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids)
    return dataset
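# Usage sketch (an assumption): load_examples above builds a dataset without
# labels, so inference iterates with a SequentialSampler and unpacks only the
# three input tensors. The model is hypothetical and assumed to follow the
# pytorch-transformers convention of returning logits as the first tuple element.
def example_predict(model, dataset, batch_size=32):
    from torch.utils.data import DataLoader, SequentialSampler
    loader = DataLoader(dataset, sampler=SequentialSampler(dataset), batch_size=batch_size)
    preds = []
    model.eval()
    with torch.no_grad():
        for input_ids, input_mask, segment_ids in loader:
            logits = model(input_ids, attention_mask=input_mask,
                           token_type_ids=segment_ids)[0]
            preds.append(logits.argmax(dim=-1))
    return torch.cat(preds)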
def load_and_cache_examples(args, task, tokenizer, evaluate=False):
    processor = processors[task]()
    output_mode = output_modes[task]
    # Load data features from cache or dataset file
    cached_features_file = os.path.join(
        args.data_dir, 'cached_{}_{}_{}_{}'.format(
            'dev' if evaluate else 'train',
            list(filter(None, args.model_name_or_path.split('/'))).pop(),
            str(args.max_seq_length),
            str(task)))
    if os.path.exists(cached_features_file):
        logger.info("Loading features from cached file %s", cached_features_file)
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", args.data_dir)
        label_list = processor.get_labels()
        examples = processor.get_dev_examples(args.data_dir) if evaluate \
            else processor.get_train_examples(args.data_dir)
        features = convert_examples_to_features(
            examples, label_list, args.max_seq_length, tokenizer, output_mode,
            cls_token_at_end=False,  # BERT-style: CLS token at the start (XLNet would put it at the end)
            cls_token=tokenizer.cls_token,
            cls_token_segment_id=0,
            sep_token=tokenizer.sep_token,
            # RoBERTa would use an extra separator between sentence pairs,
            # cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
            sep_token_extra=False,
            pad_on_left=False,  # XLNet would pad on the left
            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
            pad_token_segment_id=0,
        )
        logger.info("Saving features into cached file %s", cached_features_file)
        torch.save(features, cached_features_file)

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
    all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)

    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
    return dataset
def load_and_cache_examples(args, task, tokenizer, evaluate=False, is_aux=False):
    """
    :param args:
    :param task: e.g. the task name
    :param tokenizer: a loaded tokenizer
    :param evaluate: bool; whether to load the evaluation set rather than the
        training set. True loads dev.tsv.
    :param is_aux: bool
    :return:
    """
    if is_aux:
        data_dir = args.aux_data_dir
    else:
        data_dir = args.data_dir
    processor = processors[task]()
    output_mode = output_modes[task]
    # Load data features from cache or dataset file.
    # If truncation should already happen during preprocessing, the processor
    # needs to know the maximum sequence length.
    processor.max_seq_length = args.max_seq_length
    cached_features_file = os.path.join(
        data_dir, 'cached_{}_{}_{}'.format(
            'dev' if evaluate else 'train',
            str(args.max_seq_length),
            str(task)))
    if os.path.exists(cached_features_file):
        logger.info("Loading features from cached file %s", cached_features_file)
        features = torch.load(cached_features_file)
    else:
        logger.info("No cached features found; creating features from dataset file at %s", data_dir)
        label_list = processor.get_labels()
        examples = processor.get_dev_examples(data_dir) if evaluate \
            else processor.get_train_examples(data_dir)
        features = convert_examples_to_features(examples, label_list,
                                                args.max_seq_length, tokenizer,
                                                output_mode,
                                                cls_token_segment_id=0,
                                                pad_token_segment_id=0)
        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s", cached_features_file)
            torch.save(features, cached_features_file)

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
    if output_mode == "classification":
        all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)
    elif output_mode == "regression":
        all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.float)

    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
    return dataset
def load_and_cache_examples(args, task, tokenizer, evaluate=False):
    processor = processors[task]()
    output_mode = output_modes[task]
    cached_features_file = os.path.join(
        args.data_dir, 'cached_{}'.format('dev' if evaluate else 'train'))
    if os.path.exists(cached_features_file) and not args.overwrite_cache:
        # logger.info('Loading features from cached file {}'.format(cached_features_file))
        features = torch.load(cached_features_file)
    else:
        # logger.info('Creating features from dataset file at {}'.format(args.data_dir))
        label_list = processor.get_labels()
        examples = processor.get_dev_examples(args.data_dir) if evaluate \
            else processor.get_train_examples(args.data_dir)
        # Double the training data by adding each example with text_a and
        # text_b swapped; evaluation data is left unchanged.
        if evaluate:
            examples_reverse = []
        else:
            examples_reverse = [
                InputExample(pid=example.pid,
                             text_a=example.text_b,
                             text_b=example.text_a,
                             aug_a=example.aug_a,
                             aug_b=example.aug_b,
                             label=example.label) for example in examples
            ]
        features = convert_examples_to_features(examples + examples_reverse,
                                                label_list,
                                                args.max_seq_length,
                                                tokenizer,
                                                output_mode,
                                                cls_token_at_end=True,
                                                cls_token=tokenizer.cls_token,
                                                sep_token=tokenizer.sep_token,
                                                cls_token_segment_id=2,
                                                pad_on_left=True,
                                                pad_token_segment_id=4,
                                                augment=args.augment)
        # logger.info('Saving features into cached file {}'.format(cached_features_file))
        torch.save(features, cached_features_file)

    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
    if output_mode == "classification":
        all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)
    elif output_mode == "regression":
        all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.float)
    all_pids = torch.tensor([int(f.pid) for f in features], dtype=torch.long)

    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                            all_label_ids, all_pids)
    return dataset
def load_and_cache_examples(args, task, tokenizer, evaluate=False):
    processor = processors[task]()
    output_mode = output_modes[task]
    # Load data features from cache or dataset file
    cached_features_file = os.path.join(
        args.data_dir,
        "cached_{}_{}_{}_{}".format(
            "dev" if evaluate else "train",
            list(filter(None, args.model_name_or_path.split("/"))).pop(),
            str(args.max_seq_length),
            str(task),
        ),
    )
    if os.path.exists(cached_features_file):
        logger.info("Loading features from cached file %s", cached_features_file)
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", args.data_dir)
        label_list = processor.get_labels()
        examples = (processor.get_dev_examples(args.data_dir)
                    if evaluate else processor.get_train_examples(args.data_dir))
        features = convert_examples_to_features(
            examples, label_list, args.max_seq_length, tokenizer, output_mode,
            cls_token_at_end=bool(args.model_type in ["xlnet"]),  # XLNet has the CLS token at the end
            cls_token=tokenizer.cls_token,
            sep_token=tokenizer.sep_token,
            cls_token_segment_id=2 if args.model_type in ["xlnet"] else 1,
            pad_on_left=bool(args.model_type in ["xlnet"]),  # pad on the left for XLNet
            pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0,
        )
        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s", cached_features_file)
            torch.save(features, cached_features_file)

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
    if output_mode == "classification":
        all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)
    elif output_mode == "regression":
        all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.float)

    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
    return dataset
def load_predict_dataset(dataframe, args, task, tokenizer, evaluate=False):
    """
    :param dataframe: dataframe containing columns like those specified in train.csv and dev.csv
    :param args:
    :param task:
    :param tokenizer:
    :param evaluate:
    :return: an instance of TensorDataset
    """
    if args.local_rank not in [-1, 0] and not evaluate:
        # Make sure only the first process in distributed training processes
        # the dataset; the others will use the cache.
        torch.distributed.barrier()

    processor = processors[task]()
    output_mode = output_modes[task]
    logger.info("Creating features from dataframe size={0}".format(len(dataframe)))
    label_list = processor.get_labels()
    logger.info('label_list={0}'.format(label_list))
    if task in ['mnli', 'mnli-mm'] and args.model_type in ['roberta']:
        # HACK: label indices are swapped in the RoBERTa pretrained model
        label_list[1], label_list[2] = label_list[2], label_list[1]
    examples = processor.get_test_examples(dataframe)
    logger.info('Examples={0}'.format(examples))
    features = convert_examples_to_features(
        examples, label_list, args.max_seq_length, tokenizer, output_mode,
        cls_token_at_end=bool(args.model_type in ['xlnet']),  # XLNet has the CLS token at the end
        cls_token=tokenizer.cls_token,
        cls_token_segment_id=2 if args.model_type in ['xlnet'] else 0,
        sep_token=tokenizer.sep_token,
        # RoBERTa uses an extra separator between sentence pairs,
        # cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
        sep_token_extra=bool(args.model_type in ['roberta']),
        pad_on_left=bool(args.model_type in ['xlnet']),  # pad on the left for XLNet
        pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
        pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0,
    )
    logger.info('Features={}'.format(features))

    if args.local_rank == 0 and not evaluate:
        # The first process is done; the other processes can now load the cache.
        torch.distributed.barrier()

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
    if output_mode == "classification":
        all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)
    elif output_mode == "regression":
        all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.float)

    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
    return dataset
def load_examples(contents, max_seq_length, tokenizer, label_list, prefix_name):
    """
    :param contents: e.g. [('苹果很好用', '苹果')] or [('苹果很好用', '苹果', '积极')]
    :param max_seq_length:
    :param tokenizer: an initialized tokenizer
    :param label_list:
    :return:
    """
    examples = []
    for guid, content in enumerate(contents):
        if len(content) == 2:
            # Only sentence and aspect keyword: used for prediction
            sentence, aspect = content
            sentence = prefix_name + sentence
            examples.append(InputExample(guid=guid, text_a=sentence, text_b=aspect))
        elif len(content) == 3:
            # Sentence, aspect keyword, and label: usually used for training
            sentence, aspect, label = content
            sentence = prefix_name + sentence
            examples.append(InputExample(guid=guid, text_a=sentence, text_b=aspect, label=label))
        elif len(content) == 5:
            # Sentence, aspect keyword, and label, usually for training;
            # start and end give the aspect's position in the sentence
            sentence, aspect, start, end, label = content
            sentence = prefix_name + sentence
            examples.append(InputExample(guid=guid, text_a=sentence, text_b=aspect, label=label))
        else:
            print(f"Malformed record, skipping: {guid}: {content}")
    features = convert_examples_to_features(examples, label_list, max_seq_length,
                                            tokenizer, output_mode="classification",
                                            cls_token_segment_id=0,
                                            pad_token_segment_id=0)
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
    all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)
    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
    return dataset
def load_and_cache_examples_line_list(lines_list, args, task, tokenizer, evaluate=False):
    if args.local_rank not in [-1, 0] and not evaluate:
        # Make sure only the first process in distributed training processes
        # the dataset; the others will use the cache.
        torch.distributed.barrier()

    processor = processors[task]()
    output_mode = output_modes[task]
    # Load data features from cache or dataset file
    label_list = processor.get_labels()
    if task in ['mnli', 'mnli-mm'] and args.model_type in ['roberta']:
        # HACK: label indices are swapped in the RoBERTa pretrained model
        label_list[1], label_list[2] = label_list[2], label_list[1]
    examples = processor.get_predict_examples(lines_list=lines_list)
    # examples = processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir)
    features = convert_examples_to_features(
        examples, label_list, args.max_seq_length, tokenizer, output_mode,
        cls_token_at_end=bool(args.model_type in ['xlnet']),  # XLNet has the CLS token at the end
        cls_token=tokenizer.cls_token,
        cls_token_segment_id=2 if args.model_type in ['xlnet'] else 0,
        sep_token=tokenizer.sep_token,
        # RoBERTa uses an extra separator between sentence pairs,
        # cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
        sep_token_extra=bool(args.model_type in ['roberta']),
        pad_on_left=bool(args.model_type in ['xlnet']),  # pad on the left for XLNet
        pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
        pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0,
    )

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
    if output_mode == "classification":
        all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)
    elif output_mode == "regression":
        all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.float)

    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
    return dataset
def load_and_cache_examples(args, task, tokenizer, evaluate=False, is_aux=False):
    if is_aux:
        data_dir = args.aux_data_dir
    else:
        data_dir = args.data_dir
    processor = processors[task]()
    output_mode = output_modes[task]
    # Load data features from cache or dataset file
    cached_features_file = os.path.join(
        data_dir, 'cached_{}_{}_{}'.format(
            'dev' if evaluate else 'train',
            str(args.max_seq_length),
            str(task)))
    if os.path.exists(cached_features_file):
        logger.info("Loading features from cached file %s", cached_features_file)
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", data_dir)
        label_list = processor.get_labels()
        examples = processor.get_dev_examples(data_dir) if evaluate \
            else processor.get_train_examples(data_dir)
        features = convert_examples_to_features(examples, label_list,
                                                args.max_seq_length, tokenizer,
                                                output_mode,
                                                cls_token_segment_id=0,
                                                pad_token_segment_id=0)
        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s", cached_features_file)
            torch.save(features, cached_features_file)

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
    if output_mode == "classification":
        all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)
    elif output_mode == "regression":
        all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.float)

    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
    return dataset
def load_and_cache_examples(args, task, tokenizer, evaluate=False):
    def transform_examples_to_hr(exmpls):
        # Render each example in a human-readable form.
        examples_hr = [
            '[CLS] ' + exp.text_a + ' [SEP] ' + exp.text_b + ' [LABEL] ' + exp.label
            for exp in exmpls
        ]
        return examples_hr

    processor = processors[task]()
    output_mode = output_modes[task]
    examples = None
    tokenized_examples = None
    logger.info("Creating features from dataset file at %s", args.data_dir)
    label_list = processor.get_labels()
    examples = processor.get_dev_examples(args.data_dir) if evaluate \
        else processor.get_train_examples(args.data_dir)
    features, tokenized_examples = convert_examples_to_features(
        examples, label_list, args.max_seq_length, tokenizer, output_mode,
        cls_token_at_end=bool(args.model_type in ['xlnet']),  # XLNet has the CLS token at the end
        cls_token=tokenizer.cls_token,
        sep_token=tokenizer.sep_token,
        cls_token_segment_id=2 if args.model_type in ['xlnet'] else 1,
        pad_on_left=bool(args.model_type in ['xlnet']),  # pad on the left for XLNet
        pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0)

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
    if output_mode == "classification":
        all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)
    elif output_mode == "regression":
        all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.float)

    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
    return dataset, transform_examples_to_hr(examples)
def load_and_cache_examples(args, task, tokenizer, evaluate=False):
    if args.local_rank not in [-1, 0]:
        # Make sure only the first process in distributed training processes
        # the dataset; the others will use the cache.
        torch.distributed.barrier()

    processor = None
    if args.task_name == "multilabel":
        processor = processors[task](args.data_dir)
    else:
        processor = processors[task]()
    output_mode = output_modes[task]
    # Load data features from cache or dataset file
    cached_features_file = os.path.join(args.data_dir, 'cached_{}_{}_{}_{}'.format(
        'dev' if evaluate else 'train',
        list(filter(None, args.model_name_or_path.split('/'))).pop(),
        str(args.max_seq_length),
        str(task)))
    if os.path.exists(cached_features_file):
        logger.info("Loading features from cached file %s", cached_features_file)
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", args.data_dir)
        label_list = processor.get_labels()
        examples = processor.get_dev_examples(args.data_dir) if evaluate \
            else processor.get_train_examples(args.data_dir)
        features = convert_examples_to_features(
            examples, label_list, args.max_seq_length, tokenizer, output_mode,
            cls_token_at_end=bool(args.model_type in ['xlnet']),  # XLNet has the CLS token at the end
            cls_token=tokenizer.cls_token,
            sep_token=tokenizer.sep_token,
            cls_token_segment_id=2 if args.model_type in ['xlnet'] else 0,
            pad_on_left=bool(args.model_type in ['xlnet']),  # pad on the left for XLNet
            pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0)
        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s", cached_features_file)
            torch.save(features, cached_features_file)

    if args.local_rank == 0:
        # The first process is done; the other processes can now load the cache.
        torch.distributed.barrier()

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
    if output_mode == "classification":
        all_label_ids = torch.tensor([f.label_ids for f in features], dtype=torch.long)
    elif output_mode == "regression":
        all_label_ids = torch.tensor([f.label_ids for f in features], dtype=torch.float)
    elif output_mode == "multi-classification":
        all_label_ids = torch.tensor([f.label_ids for f in features], dtype=torch.float)

    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
    return dataset
def load_examples(processor, list_examples, tokenizer, LABELS=False):
    output_mode = "classification"
    # Load data features from the list of provided examples
    label_list = processor.get_labels()
    examples = processor.get_examples(list_examples, "test", LABELS)
    max_seq_length = 128
    model_type = "bert"
    features = convert_examples_to_features(
        examples, label_list, max_seq_length, tokenizer, output_mode,
        cls_token_at_end=bool(model_type in ['xlnet']),  # XLNet has the CLS token at the end
        cls_token=tokenizer.cls_token,
        cls_token_segment_id=2 if model_type in ['xlnet'] else 0,
        sep_token=tokenizer.sep_token,
        # RoBERTa uses an extra separator between sentence pairs,
        # cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
        sep_token_extra=bool(model_type in ['roberta']),
        pad_on_left=bool(model_type in ['xlnet']),  # pad on the left for XLNet
        pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
        pad_token_segment_id=4 if model_type in ['xlnet'] else 0,
    )

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
    if LABELS:
        if output_mode == "classification":
            all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)
        elif output_mode == "regression":
            all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.float)
        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
    else:
        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids)
    return dataset
def load_and_cache_pred_examples(args, task, tokenizer):
    processor = processors[task]()
    output_mode = output_modes[task]
    logger.info("Creating features from dataset file at %s", args.data_dir)
    label_list = processor.get_labels()
    if task in ['mnli', 'mnli-mm'] and args.model_type in ['roberta']:
        # HACK: label indices are swapped in the RoBERTa pretrained model
        label_list[1], label_list[2] = label_list[2], label_list[1]
    examples = processor.get_predict_samples(args.data_dir)
    features = convert_examples_to_features(
        examples, label_list, args.max_seq_length, tokenizer, output_mode,
        cls_token_at_end=bool(args.model_type in ['xlnet']),  # XLNet has the CLS token at the end
        cls_token=tokenizer.cls_token,
        cls_token_segment_id=2 if args.model_type in ['xlnet'] else 0,
        sep_token=tokenizer.sep_token,
        # RoBERTa uses an extra separator between sentence pairs,
        # cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
        sep_token_extra=bool(args.model_type in ['roberta']),
        pad_on_left=bool(args.model_type in ['xlnet']),  # pad on the left for XLNet
        pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
        pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0,
    )

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
    if output_mode == "classification":
        all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)
    elif output_mode == "regression":
        all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.float)

    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
    return dataset
def load_and_cache_examples(args, task, tokenizer, input1, input2, evaluate=False):
    processor = processors[task]()
    output_mode = output_modes[task]
    # Build features directly from the two input texts (no caching here)
    label_list = processor.get_labels()
    examples = processor._create_example_for_test(input1, input2)
    features = convert_examples_to_features(
        examples, label_list, args["max_seq_length"], tokenizer, output_mode,
        cls_token_at_end=False,
        cls_token=tokenizer.cls_token,
        cls_token_segment_id=0,
        sep_token=tokenizer.sep_token,
        sep_token_extra=False,
        pad_on_left=False,
        pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
        pad_token_segment_id=0,
    )

    # Convert to Tensors
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
    return all_input_ids, all_input_mask, all_segment_ids
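# Usage sketch (an assumption): because the variant above returns raw tensors
# rather than a TensorDataset, a single sentence pair can be scored in one
# forward pass. The model is hypothetical and assumed to return logits as the
# first tuple element, as in pytorch-transformers.
def example_score_pair(model, args, task, tokenizer, text_a, text_b):
    input_ids, input_mask, segment_ids = load_and_cache_examples(
        args, task, tokenizer, text_a, text_b)
    model.eval()
    with torch.no_grad():
        logits = model(input_ids, attention_mask=input_mask,
                       token_type_ids=segment_ids)[0]
    return logits.softmax(dim=-1)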
def load_and_cache_examples(args, task, tokenizer, set_type='train'):
    processor = processors[task]()
    output_mode = output_modes[task]
    is_multi_choice = output_mode == 'multi-choice'
    # Load data features from cache or dataset file
    cached_features_file = os.path.join(args.data_dir[task], 'cached_{}_{}_{}_{}'.format(
        set_type,
        list(filter(None, args.model_name_or_path.split('/'))).pop(),
        str(MAX_SEQ_LENGTHS[task]),
        str(task)))
    if os.path.exists(cached_features_file):
        logger.info("Loading features from cached file %s", cached_features_file)
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", args.data_dir[task])
        label_list = processor.get_labels()
        if set_type == 'train':
            examples = processor.get_train_examples(args.data_dir[task])
        elif set_type == 'dev':
            examples = processor.get_dev_examples(args.data_dir[task])
        else:
            examples = processor.get_test_examples(args.data_dir[task])
        features = convert_examples_to_features(
            examples, label_list, MAX_SEQ_LENGTHS[task], tokenizer, output_mode,
            cls_token_at_end=bool(args.model_type in ['xlnet']),  # XLNet has the CLS token at the end
            cls_token=tokenizer.cls_token,
            sep_token=tokenizer.sep_token,
            cls_token_segment_id=2 if args.model_type in ['xlnet'] else 0,
            pad_on_left=bool(args.model_type in ['xlnet']),  # pad on the left for XLNet
            pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0,
            do_lower_case=args.do_lower_case,
            is_multi_choice=is_multi_choice)
        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s", cached_features_file)
            torch.save(features, cached_features_file)

    # Convert to Tensors and build dataset
    dataset = convert_features_to_tensors(features, output_mode, is_multi_choice=is_multi_choice)
    return dataset
def load_and_cache_examples(args, task, tokenizer, evaluate=False):
    processor = processors[task]()
    output_mode = output_modes[task]
    data_dir = args.data_dir
    logger.info("Creating features from dataset file at %s", data_dir)
    label_list = processor.get_labels()
    examples = processor.get_dev_examples(data_dir) if evaluate \
        else processor.get_train_examples(data_dir)
    features = convert_examples_to_features(examples, label_list,
                                            args.max_seq_length, tokenizer,
                                            output_mode,
                                            cls_token_at_end=False,
                                            cls_token=tokenizer.cls_token,
                                            sep_token=tokenizer.sep_token,
                                            cls_token_segment_id=1,
                                            pad_on_left=False,
                                            pad_token_segment_id=0)

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
    if output_mode == "classification":
        all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)
    elif output_mode == "regression":
        all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.float)

    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
    return dataset
def load_and_cache_examples(args, task, tokenizer, type):
    processor = processors[task]()
    output_mode = output_modes[task]
    # Load data features from cache or dataset file
    cached_features_file = os.path.join(
        args.data_dir, 'cached_{}_{}_{}_{}'.format(
            type,
            list(filter(None, args.model_name_or_path.split('/'))).pop(),
            str(args.max_seq_length),
            str(task)))
    if os.path.exists(cached_features_file):
        logger.info("Loading features from cached file %s", cached_features_file)
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", args.data_dir)
        label_list = processor.get_labels()
        docs = defaultdict(dict)
        reader = jsonlines.Reader(open(os.path.join(args.data_dir, 'docs.jsonl')))
        for line in reader:
            docs[line["docid"]] = line["document"]
        if type == 'train':
            examples = processor.get_train_examples(args.data_dir, docs)
        elif type == 'dev':
            examples = processor.get_dev_examples(args.data_dir, docs)
        else:
            examples = processor.get_test_examples(args.data_dir, docs)
        features = convert_examples_to_features(
            examples, label_list, args.max_seq_length, tokenizer, output_mode,
            cls_token_at_end=bool(args.model_type in ['xlnet']),  # XLNet has the CLS token at the end
            cls_token=tokenizer.cls_token,
            cls_token_segment_id=2 if args.model_type in ['xlnet'] else 0,
            sep_token=tokenizer.sep_token,
            # RoBERTa uses an extra separator between sentence pairs,
            # cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
            sep_token_extra=bool(args.model_type in ['roberta']),
            pad_on_left=bool(args.model_type in ['xlnet']),  # pad on the left for XLNet
            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
            pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0,
        )
        logger.info("Saving features into cached file %s", cached_features_file)
        torch.save(features, cached_features_file)

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
    if output_mode == "classification":
        all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)
    elif output_mode == "regression":
        all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.float)

    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
    return dataset
def load_and_cache_examples(args, task, tokenizer, evaluate=False):
    def transform_examples_to_hr(exmpls):
        # Render each example in a human-readable form.
        examples_hr = [
            '[CLS] ' + exp.text_a + ' [SEP] ' + exp.text_b + ' [LABEL] ' + exp.label
            for exp in exmpls
        ]
        return examples_hr

    processor = processors[task]()
    output_mode = output_modes[task]
    # Load data features from cache or dataset file
    cached_features_file = os.path.join(
        args.data_dir, 'cached_{}_{}_{}_{}'.format(
            'dev' if evaluate else 'train',
            list(filter(None, args.model_name_or_path.split('/'))).pop(),
            str(args.max_seq_length),
            str(task)))
    examples = None
    tokenized_examples = None
    if os.path.exists(cached_features_file):
        logger.info("Loading features from cached file %s", cached_features_file)
        # Temporary code for writing out the textual error analysis;
        # comment this out for training!
        label_list = processor.get_labels()
        examples = processor.get_dev_examples(args.data_dir) if evaluate \
            else processor.get_train_examples(args.data_dir)
        features, tokenized_examples = convert_examples_to_features(
            examples, label_list, args.max_seq_length, tokenizer, output_mode,
            cls_token_at_end=bool(args.model_type in ['xlnet']),  # XLNet has the CLS token at the end
            cls_token=tokenizer.cls_token,
            sep_token=tokenizer.sep_token,
            cls_token_segment_id=2 if args.model_type in ['xlnet'] else 1,
            pad_on_left=bool(args.model_type in ['xlnet']),  # pad on the left for XLNet
            pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0)
        # End of temporary code.
        # features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", args.data_dir)
        label_list = processor.get_labels()
        examples = processor.get_dev_examples(args.data_dir) if evaluate \
            else processor.get_train_examples(args.data_dir)
        features, tokenized_examples = convert_examples_to_features(
            examples, label_list, args.max_seq_length, tokenizer, output_mode,
            cls_token_at_end=bool(args.model_type in ['xlnet']),  # XLNet has the CLS token at the end
            cls_token=tokenizer.cls_token,
            sep_token=tokenizer.sep_token,
            cls_token_segment_id=2 if args.model_type in ['xlnet'] else 1,
            pad_on_left=bool(args.model_type in ['xlnet']),  # pad on the left for XLNet
            pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0)
        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s", cached_features_file)
            torch.save(features, cached_features_file)

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
    if output_mode == "classification":
        all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)
    elif output_mode == "regression":
        all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.float)

    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
    return dataset, transform_examples_to_hr(examples)
def load_and_cache_examples(args, task, tokenizer, evaluate=False, dev_evaluate=False):
    data_dir = task_to_data_dir[task]
    if task.startswith("fever"):
        processor = processors["fever"]()
    elif task in nli_task_names:
        processor = processors["nli"](data_dir)
    elif task in ["mnli"]:
        processor = processors["mnli"](hans=args.hans)
    elif task == "mnli-mm":
        processor = processors["mnli-mm"](hans=args.hans)
    elif task.startswith("HANS"):
        processor = processors["hans"](hans=args.hans)
    else:
        processor = processors[task]()

    # Load data features from cache or dataset file
    cached_features_file = os.path.join(
        data_dir, 'cached_{}_{}_{}_{}'.format(
            'dev' if evaluate else 'train',
            list(filter(None, args.model_name_or_path.split('/'))).pop(),
            str(args.max_seq_length),
            str(task)))
    print("File is: ", cached_features_file)

    if False:  # os.path.exists(cached_features_file) and args.use_cached_dataset:
        logger.info("Loading features from cached file %s", cached_features_file)
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", data_dir)
        label_list = processor.get_labels()
        if dev_evaluate:  # and task in nli_task_names:
            examples = processor.get_validation_dev_examples(data_dir)
        else:
            examples = processor.get_dev_examples(data_dir) if evaluate \
                else processor.get_train_examples(data_dir)

        features = convert_examples_to_features(
            examples, label_list, args.max_seq_length, tokenizer, "classification",
            cls_token_at_end=bool(args.model_type in ['xlnet']),  # XLNet has the CLS token at the end
            cls_token=tokenizer.cls_token,
            sep_token=tokenizer.sep_token,
            cls_token_segment_id=2 if args.model_type in ['xlnet'] else 0,
            pad_on_left=bool(args.model_type in ['xlnet']),  # pad on the left for XLNet
            pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0,
            rubi=args.rubi or args.hypothesis_only or args.focal_loss or args.poe_loss or args.hans_only,
            rubi_text=args.rubi_text,
            hans=(args.hans and not evaluate) or args.hans_only,
            hans_features=args.hans_features)

        logger.info("Saving features into cached file %s", cached_features_file)
        torch.save(features, cached_features_file)

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
    all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)

    if (args.hans and not evaluate) or args.hans_only:
        all_h_ids = torch.tensor([f.h_ids for f in features], dtype=torch.long)
        all_h_masks = torch.tensor([f.input_mask_h for f in features], dtype=torch.long)
        all_p_ids = torch.tensor([f.p_ids for f in features], dtype=torch.long)
        all_p_masks = torch.tensor([f.input_mask_p for f in features], dtype=torch.long)
        all_have_overlap = torch.tensor([f.have_overlap for f in features], dtype=torch.float)
        all_overlap_rate = torch.tensor([f.overlap_rate for f in features], dtype=torch.float)
        all_subsequence = torch.tensor([f.subsequence for f in features], dtype=torch.float)
        all_constituent = torch.tensor([f.constituent for f in features], dtype=torch.float)
        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids,
                                all_h_ids, all_h_masks, all_p_ids, all_p_masks,
                                all_have_overlap, all_overlap_rate,
                                all_subsequence, all_constituent)
    elif args.rubi or args.hypothesis_only or args.focal_loss or args.poe_loss:
        # Hypothesis representations.
        all_h_ids = torch.tensor([f.h_ids for f in features], dtype=torch.long)
        all_h_masks = torch.tensor([f.input_mask_h for f in features], dtype=torch.long)
        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids,
                                all_h_ids, all_h_masks)
    else:
        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)

    return dataset, processor.get_labels(), processor.num_classes
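# Usage sketch (an assumption): the variant above appends extra per-example
# tensors when bias-handling flags are set, so batches must be unpacked
# positionally according to which flags were active. Shown for the
# rubi/hypothesis-only case, where the TensorDataset holds six tensors.
def example_iterate_rubi_batches(dataset, batch_size=32):
    from torch.utils.data import DataLoader, RandomSampler
    loader = DataLoader(dataset, sampler=RandomSampler(dataset), batch_size=batch_size)
    for batch in loader:
        input_ids, input_mask, segment_ids, label_ids, h_ids, h_masks = batch
        # h_ids / h_masks carry the hypothesis-only inputs for the bias model.
        yield batch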