import json
import os
import sys
from os.path import isfile

import numpy as np
import torch
from torch.utils.data import TensorDataset

# Project-level helpers (convert_examples_to_features, processors, output_modes,
# logger, and the scoring globals model/tokenizer/configs/classes_list) are
# assumed to be defined elsewhere in this repo.


def load_dataset(main_file, args, processor, tokenizer, output_mode, data_type=None):
    # Prepare data loader. output_mode is unused here but kept for call-site
    # compatibility. main_file is a path prefix; each field gets its own file.
    id_file = main_file + "_ids_nocls.pt"
    mask_file = main_file + "_mask_nocls.pt"
    label_file = main_file + "_labels.pt"

    # Case 1: tensor files already exist on disk.
    file_exist_count = sum([isfile(id_file), isfile(label_file), isfile(mask_file)])
    if 0 < file_exist_count < 3:
        sys.exit("Only part of the data is saved as tensor files. Delete those files and try again.")
    elif file_exist_count == 3:
        all_input_ids = torch.load(id_file)
        all_masks = torch.load(mask_file)
        all_label_ids = torch.load(label_file)
        return TensorDataset(all_input_ids, all_masks, all_label_ids)

    # Case 2: no cache yet; featurize the requested split from the raw examples.
    if data_type == "train":
        features = convert_examples_to_features(
            processor.get_train_examples(args.data_dir),
            processor.get_labels(), tokenizer, args.max_tokens)
    elif data_type == "test":
        features = convert_examples_to_features(
            processor.get_dev_examples(args.data_dir),
            processor.get_labels(), tokenizer, args.max_tokens)
    elif data_type == "val":
        features = convert_examples_to_features(
            processor.get_val_examples(args.data_dir),
            processor.get_labels(), tokenizer, args.max_tokens)
    else:
        sys.exit(f"invalid data_type {data_type}")

    # Persist each field as its own tensor file.
    torch.save(torch.tensor([f.input_ids for f in features], dtype=torch.long), id_file)
    torch.save(torch.tensor([f.input_mask for f in features], dtype=torch.long), mask_file)
    torch.save(torch.tensor([f.label_id for f in features], dtype=torch.long), label_file)

    # Recurse: all three files now exist, so the call hits the cache branch
    # above and returns the TensorDataset.
    return load_dataset(main_file, args, processor, tokenizer, output_mode,
                        data_type=data_type)
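
# Usage sketch (hypothetical, not from the original repo): build the three
# splits once and wrap each in a DataLoader. Assumes `args` carries `data_dir`,
# `max_tokens`, and a `batch_size` field, and that `processor`/`tokenizer`
# come from the project's own setup code.
def build_loaders(args, processor, tokenizer, output_mode):
    from torch.utils.data import DataLoader
    loaders = {}
    for split in ("train", "val", "test"):
        dataset = load_dataset(os.path.join(args.data_dir, split), args,
                               processor, tokenizer, output_mode, data_type=split)
        # Shuffle only the training split.
        loaders[split] = DataLoader(dataset, batch_size=args.batch_size,
                                    shuffle=(split == "train"))
    return loaders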
def run(data):
    # Scoring entry point: `data` is a JSON string with an "input" field.
    # Relies on module-level globals (model, tokenizer, configs, classes_list)
    # presumably set during service initialization.
    data_loaded = json.loads(data)
    input_sentence = data_loaded["input"]
    # Wrap the raw sentence in the (guid, text, label) triple the featurizer expects.
    input_example = ("PREDICT_0", input_sentence, "None")
    max_seq_len = configs["max_sequence_length"]
    feats = convert_examples_to_features([input_example], classes_list, max_seq_len, tokenizer)[0]
    # Add a batch dimension of 1 to every feature tensor.
    feats = [torch.tensor(x).unsqueeze(0) for x in feats]
    # Index 3 (presumably the label field) is not needed at inference time.
    model_out = model(input_ids=feats[0],
                      attention_mask=feats[1],
                      token_type_ids=feats[2],
                      class_label_ids=None,
                      input_ids_masked=feats[4])
    logits_softmaxed = torch.nn.functional.softmax(model_out[1][0], dim=-1).detach().cpu().numpy()
    return {"country_prediction": str(classes_list[np.argmax(logits_softmaxed)]),
            "province_prediction": "None"}
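
# Minimal smoke test for the scoring entry point (a sketch, not part of the
# original service): `run` expects a JSON string, and the module-level globals
# must already have been set up by the service's init() before calling it.
if __name__ == "__main__":
    payload = json.dumps({"input": "example sentence to classify"})
    print(run(payload))  # e.g. {"country_prediction": "...", "province_prediction": "None"}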
def load_and_cache_examples(args, task, tokenizer, evaluate=False):
    processor = processors[task]()
    output_mode = output_modes[task]
    # Load data features from cache or dataset file
    cached_features_file = os.path.join(
        args.data_dir, 'cached_{}_{}_{}_{}'.format(
            'dev' if evaluate else 'train',
            list(filter(None, args.model_name_or_path.split('/'))).pop(),
            str(args.max_seq_length),
            str(task)))
    if os.path.exists(cached_features_file):
        logger.info("Loading features from cached file %s", cached_features_file)
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", args.data_dir)
        label_list = processor.get_labels()
        examples = (processor.get_dev_examples(args.data_dir)
                    if evaluate else processor.get_train_examples(args.data_dir))
        features = convert_examples_to_features(
            examples, label_list, args.max_seq_length, tokenizer, output_mode,
            cls_token_at_end=bool(args.model_type in ['xlnet']),  # xlnet has a cls token at the end
            cls_token=tokenizer.cls_token,
            sep_token=tokenizer.sep_token,
            cls_token_segment_id=2 if args.model_type in ['xlnet'] else 1,
            pad_on_left=bool(args.model_type in ['xlnet']),  # pad on the left for xlnet
            pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0)
        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s", cached_features_file)
            torch.save(features, cached_features_file)

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
    if output_mode == "classification":
        all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)
    elif output_mode == "regression":
        all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.float)

    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
    return dataset
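
# Typical call site (a sketch, assuming the usual run_glue-style `args`):
# build the evaluation dataset once, then iterate over it sequentially.
# `args.eval_batch_size` is an assumed field, not defined above.
def build_eval_dataloader(args, task, tokenizer):
    from torch.utils.data import DataLoader, SequentialSampler
    eval_dataset = load_and_cache_examples(args, task, tokenizer, evaluate=True)
    return DataLoader(eval_dataset,
                      sampler=SequentialSampler(eval_dataset),
                      batch_size=args.eval_batch_size)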
def load_and_cache_examples(args, folder, task, tokenizer):
    # Test-set variant: loads pre-saved examples from `folder` instead of
    # reading the raw dataset through the processor.
    processor = processors[task]()
    output_mode = output_modes[task]
    cached_features_file = os.path.join(
        folder, 'cached_{}_{}_{}_{}_{}'.format(
            'test',
            list(filter(None, args.model_name_or_path.split('/'))).pop(),
            str(args.max_seq_length),
            str(task),
            str(args.similarity_threshold)))
    if os.path.exists(cached_features_file):
        features = torch.load(cached_features_file)
    else:
        label_list = processor.get_labels()
        examples = torch.load(folder + '/example')
        features = convert_examples_to_features(
            examples, label_list, args.max_seq_length, tokenizer, output_mode,
            cls_token_at_end=bool(args.model_type in ['xlnet']),  # xlnet has a cls token at the end
            cls_token=tokenizer.cls_token,
            sep_token=tokenizer.sep_token,
            cls_token_segment_id=2 if args.model_type in ['xlnet'] else 1,
            pad_on_left=bool(args.model_type in ['xlnet']),  # pad on the left for xlnet
            pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0)
        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s", cached_features_file)
            torch.save(features, cached_features_file)

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
    if output_mode == "classification":
        all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)
    elif output_mode == "regression":
        all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.float)

    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
    return dataset
def return_logits(in_example):
    # Same featurize-and-forward path as run(), but returns the raw
    # softmaxed logits instead of a prediction dict.
    input_example = ("PREDICT_0", in_example, "None")
    max_seq_len = configs["max_sequence_length"]
    feats = convert_examples_to_features([input_example], classes_list, max_seq_len, tokenizer)[0]
    # Add a batch dimension of 1 to every feature tensor.
    feats = [torch.tensor(x).unsqueeze(0) for x in feats]
    model_out = model(input_ids=feats[0],
                      attention_mask=feats[1],
                      token_type_ids=feats[2],
                      class_label_ids=None,
                      input_ids_masked=feats[4])
    logits_softmaxed = torch.nn.functional.softmax(
        model_out[1][0], dim=-1).detach().cpu().numpy()
    return logits_softmaxed
def load_and_cache_examples_randomized(args, task, tokenizer, random_smooth, epoch, evaluate=False):
    processor = processors[task]()
    output_mode = output_modes[task]
    # Load data features from cache or dataset file. The feature cache is keyed
    # by epoch because every epoch sees a freshly perturbed copy of the data.
    cached_features_file = os.path.join(
        args.data_dir, 'cached_{}_{}_{}_{}_{}'.format(
            'dev' if evaluate else 'train',
            list(filter(None, args.model_name_or_path.split('/'))).pop(),
            str(args.max_seq_length),
            str(task),
            str(args.similarity_threshold)))

    if os.path.exists(cached_features_file + '_' + str(epoch)):
        print('Randomize dataset: cached features exist')
        logger.info("Loading features from cached file %s", cached_features_file + '_' + str(epoch))
        features = torch.load(cached_features_file + '_' + str(epoch))
    else:
        print('Randomize dataset: cached features do NOT exist')
        if os.path.exists(cached_features_file + '_example'):
            print('Randomize dataset: cached examples exist')
            examples = torch.load(cached_features_file + '_example')
        else:
            print('Randomize dataset: cached examples do NOT exist')
            logger.info("Creating features from dataset file at %s", args.data_dir)
            examples = (processor.get_dev_examples(args.data_dir)
                        if evaluate else processor.get_train_examples(args.data_dir))
            # Save the clean examples before perturbing them.
            logger.info("Saving examples into cached file %s", cached_features_file + '_example')
            torch.save(examples, cached_features_file + '_example')

        # Perturb each text field with the randomized smoothing sampler.
        for example in examples:
            if example.text_a:
                example.text_a = str(
                    random_smooth.get_perturbed_batch(np.array([[example.text_a]]))[0][0])
            if example.text_b:
                example.text_b = str(
                    random_smooth.get_perturbed_batch(np.array([[example.text_b]]))[0][0])

        label_list = processor.get_labels()
        features = convert_examples_to_features(
            examples, label_list, args.max_seq_length, tokenizer, output_mode,
            cls_token_at_end=bool(args.model_type in ['xlnet']),  # xlnet has a cls token at the end
            cls_token=tokenizer.cls_token,
            sep_token=tokenizer.sep_token,
            cls_token_segment_id=2 if args.model_type in ['xlnet'] else 1,
            pad_on_left=bool(args.model_type in ['xlnet']),  # pad on the left for xlnet
            pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0)
        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s", cached_features_file + '_' + str(epoch))
            torch.save(features, cached_features_file + '_' + str(epoch))

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
    if output_mode == "classification":
        all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)
    elif output_mode == "regression":
        all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.float)

    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
    return dataset
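
# Per-epoch training sketch (hypothetical): because the feature cache is keyed
# by epoch, each epoch re-perturbs the examples and writes its own cache file.
# `args.train_batch_size` and `num_epochs` are assumed names, and the inner
# loop body is elided.
def train_with_random_smoothing(args, task, tokenizer, random_smooth, num_epochs):
    from torch.utils.data import DataLoader, RandomSampler
    for epoch in range(num_epochs):
        train_dataset = load_and_cache_examples_randomized(
            args, task, tokenizer, random_smooth, epoch, evaluate=False)
        train_dataloader = DataLoader(train_dataset,
                                      sampler=RandomSampler(train_dataset),
                                      batch_size=args.train_batch_size)
        for batch in train_dataloader:
            ...  # forward/backward pass goes here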