def bert_features(model, tokenizer, data, batch_size=1):
    """Run `data` through BERT and return the last encoder layer as a numpy array."""
    in_features = convert_examples_to_features(data, seq_length=50, tokenizer=tokenizer)
    all_input_ids = torch.tensor([f.input_ids for f in in_features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in in_features], dtype=torch.long)
    all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)

    eval_data = TensorDataset(all_input_ids, all_input_mask, all_example_index)
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=batch_size)

    model.eval()
    bert = []
    for input_ids, input_mask, example_indices in tqdm(eval_dataloader):
        input_ids = input_ids.to(device)
        input_mask = input_mask.to(device)
        # pytorch_pretrained_bert's BertModel returns (all_encoder_layers, pooled_output);
        # keep only the final layer's hidden states.
        all_encoder_layers, _ = model(input_ids, token_type_ids=None, attention_mask=input_mask)
        bert.append(all_encoder_layers[-1].detach().cpu().numpy())
    return np.concatenate(bert, axis=0)
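
# Usage sketch (an assumption, not part of the original file): embed a few
# sentences with a stock BERT encoder. The checkpoint name is illustrative,
# and `sentences` is assumed to be in whatever format this repo's
# convert_examples_to_features expects (it may require InputExample objects).
import torch
from pytorch_pretrained_bert import BertModel, BertTokenizer

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased").to(device)
sentences = ["BERT produces contextual embeddings.", "One vector per token."]
embeddings = bert_features(model, tokenizer, sentences, batch_size=2)
print(embeddings.shape)  # (2, 50, 768) for bert-base with seq_length=50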
def load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, mode):
    # Make sure only the first process in distributed training processes the
    # dataset; the others will use the cache. (The original tested the
    # undefined name `evaluate` here; `mode == "train"` is the intended check.)
    if args.local_rank not in [-1, 0] and mode == "train":
        torch.distributed.barrier()

    # Load data features from cache or dataset file
    cached_features_file = os.path.join(
        args.data_dir,
        "cached_{}_{}_{}".format(mode, "bert", str(args.max_seq_length)),
    )
    if os.path.exists(cached_features_file) and not args.overwrite_cache:
        logger.info("Loading features from cached file %s", cached_features_file)
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", args.data_dir)
        examples = read_examples_from_file(args.data_dir, mode)
        features = convert_examples_to_features(
            examples,
            labels,
            args.max_seq_length,
            tokenizer,
            cls_token_at_end=bool(args.model_type in ["xlnet"]),  # xlnet has a cls token at the end
            cls_token=tokenizer.cls_token,
            cls_token_segment_id=2 if args.model_type in ["xlnet"] else 0,
            sep_token=tokenizer.sep_token,
            # roberta uses an extra separator between pairs of sentences, cf.
            # github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
            sep_token_extra=bool(args.model_type in ["roberta"]),
            pad_on_left=bool(args.model_type in ["xlnet"]),  # pad on the left for xlnet
            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
            pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0,
            pad_token_label_id=pad_token_label_id,
        )
        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s", cached_features_file)
            torch.save(features, cached_features_file)

    if args.local_rank == 0 and mode == "train":
        torch.distributed.barrier()  # the first process built the cache; release the others

    # Convert to Tensors and build dataset.
    # In `features`, label_ids for the extra sub-word pieces of each tokenized
    # word and for padding positions are all set to pad_token_label_id = -100;
    # they must be filtered out later during evaluation.
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
    all_label_ids = torch.tensor([f.label_ids for f in features], dtype=torch.long)
    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
    return dataset
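
# Hedged sketch (not from the original file): how the -100 positions flagged
# above are typically masked out when scoring predictions. `label_map` (id ->
# tag string) and the argument shapes are assumptions mirroring the comment in
# load_and_cache_examples.
import numpy as np

def filter_padded_labels(preds, label_ids, label_map, pad_token_label_id=-100):
    """Keep only positions whose gold label is a real tag, not -100 padding."""
    out_label_list = [[] for _ in range(label_ids.shape[0])]
    preds_list = [[] for _ in range(label_ids.shape[0])]
    for i in range(label_ids.shape[0]):
        for j in range(label_ids.shape[1]):
            if label_ids[i, j] != pad_token_label_id:
                out_label_list[i].append(label_map[label_ids[i, j]])
                preds_list[i].append(label_map[preds[i, j]])
    return out_label_list, preds_list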
# In[6]:

tokenizer = bert_utils.create_tokenizer_from_hub_module(BERT_PATH, sess)

# ### Preprocess Data

# In[7]:

train_text, train_label, num_classes = utils.load_ag_news_dataset(
    max_seq_len=MAX_SEQ_LEN, test=False)
train_label = np.asarray(train_label)
train_examples = bert_utils.convert_text_to_examples(train_text, train_label)
feat = bert_utils.convert_examples_to_features(tokenizer,
                                               train_examples,
                                               max_seq_length=MAX_SEQ_LEN,
                                               verbose=True)
(train_input_ids, train_input_masks, train_segment_ids, train_labels) = feat
train_input_ids, train_input_masks, train_segment_ids, train_labels = shuffle(
    train_input_ids, train_input_masks, train_segment_ids, train_labels)

# In[8]:

examples, labels, num_classes = utils.load_ag_news_dataset(
    max_seq_len=MAX_SEQ_LEN, test=True)
labels = np.asarray(labels)
test_examples = bert_utils.convert_text_to_examples(examples, labels)
# Mirrors the train-set call above.
feat = bert_utils.convert_examples_to_features(tokenizer,
                                               test_examples,
                                               max_seq_length=MAX_SEQ_LEN,
                                               verbose=True)
(test_input_ids, test_input_masks, test_segment_ids, test_labels) = feat
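
# Hedged sketch (not in the original notebook): once the three input arrays
# exist, a Keras model built around the hub BERT layer is typically trained
# like this. `build_model` is a hypothetical helper; the epoch and batch-size
# values are placeholders, not settings from this notebook.
model = build_model(max_seq_len=MAX_SEQ_LEN, num_classes=num_classes)
model.fit(
    [train_input_ids, train_input_masks, train_segment_ids],
    train_labels,
    validation_data=(
        [test_input_ids, test_input_masks, test_segment_ids], test_labels),
    epochs=3,
    batch_size=32,
)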
bert_samples = []
for i, test_sample in enumerate(test_samples):
    bert_sample = bert_utils.InputExample(
        guid="test-%d" % i,
        text_a=test_sample["text"],
        text_b=None,
        label=test_sample["state_label"],
        entity=test_sample["participant"],
        sequence_id=test_sample["entity_tags"].astype(int).tolist())
    bert_samples.append(bert_sample)

test_features = bert_utils.convert_examples_to_features(
    bert_samples,
    label_list=["none", "create", "destroy", "move"],
    max_seq_length=70,
    tokenizer=tokenizer,
    output_mode="classification")

with torch.no_grad():
    correct_state_label = 0
    total_state_label = 0
    sum_loss = 0.0
    state_label_cm = [[0] * 4 for _ in range(4)]  # 4x4 confusion matrix over state labels
    # fpDev = open("test_preds_epoch%d.txt" % (epoch), "w")
    for i, test_feature in enumerate(test_features):
        gpu_input_ids = test_feature.input_ids.cuda()
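
# Hedged sketch (an assumption, not the elided loop body above): how the
# counters set up before the loop are typically updated per example, given
# model logits over the four state labels.
def update_state_metrics(logits, gold_label, state_label_cm):
    """Accumulate one prediction into the 4x4 confusion matrix; return 1 if correct."""
    pred = int(logits.argmax(dim=-1).item())
    state_label_cm[gold_label][pred] += 1
    return int(pred == gold_label)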
    do_lower_case=do_lower_case)

num_train_optimization_steps = int(
    len(train_InputExamples) / batch_size / gradient_accumulation_steps) * num_epochs

model_qa = BertQA.from_pretrained(
    bert_model,
    cache_dir=os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE),
                           'distributed_{}'.format(args.local_rank)))
if args.local_rank == 0:
    torch.distributed.barrier()
model_qa.to(device)

train_features = bert_utils.convert_examples_to_features(
    train_InputExamples, MAX_SEQ_LENGTH, tokenizer)
all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
all_start_positions = torch.tensor([f.start_label_ids for f in train_features], dtype=torch.long)
all_end_positions = torch.tensor([f.end_label_ids for f in train_features], dtype=torch.long)
train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                           all_start_positions, all_end_positions)
train_sampler = SequentialSampler(
    train_data) if args.local_rank == -1 else DistributedSampler(train_data)
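
# Hedged sketch (not in the original excerpt): wiring train_data into a
# DataLoader and running one pass of gradient-accumulated training. BertQA's
# forward signature is assumed to follow pytorch_pretrained_bert's
# BertForQuestionAnswering, returning the loss when start/end positions are
# supplied; the optimizer and learning rate are placeholders.
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)
optimizer = torch.optim.Adam(model_qa.parameters(), lr=3e-5)
model_qa.train()
for step, batch in enumerate(train_dataloader):
    input_ids, input_mask, segment_ids, start_positions, end_positions = (
        t.to(device) for t in batch)
    loss = model_qa(input_ids, segment_ids, input_mask, start_positions, end_positions)
    loss = loss / gradient_accumulation_steps
    loss.backward()
    if (step + 1) % gradient_accumulation_steps == 0:
        optimizer.step()
        optimizer.zero_grad()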