with open(label_file) as f: return [line.strip() for line in f] return ['B-GPE', 'I-GPE', 'O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-TTL', 'I-TTL', 'I-FAC', 'B-FAC', 'B-VEH', 'I-VEH', 'B-WEA', 'I-WEA'] num_labels = len(get_labels()) tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False, do_basic_tokenize=False) model_dir = '../KBP_19_bert_ner_5e-5' output_model_file = os.path.join(model_dir, WEIGHTS_NAME) output_config_file = os.path.join(model_dir, CONFIG_NAME) config = BertConfig(output_config_file) model = BertForTokenClassification(config, num_labels=num_labels) model.load_state_dict(torch.load(output_model_file)) device = torch.device("cuda" if torch.cuda.is_available() else "cpu") model.to(device) def pred_ner(sent): eval_examples = read_sent(sent) label_list = get_labels() eval_features = convert_examples_to_features( eval_examples, label_list, 300, tokenizer) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) all_label_masks = torch.tensor([f.label_mask for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids, all_label_masks) # Run prediction for full data
class NERPredictor:
    """Tag sentences with a fine-tuned BERT token-classification model.

    The constructor loads the label map, tokenizer and the weights of one
    training epoch from ``model_dir``; ``classify_text`` runs the model on
    raw sentences and returns one ``(tokens, predictions)`` pair per sentence.
    """

    def __init__(self, model_dir, batch_size, epoch, max_seq_length=128,
                 local_rank=-1, no_cuda=False):
        """Load configuration, tokenizer and model weights.

        Args:
            model_dir: Directory containing ``model_config.json``, the BERT
                config file (``CONFIG_NAME``), the tokenizer files and the
                weight file ``pytorch_model_ep<epoch>.bin``.
            batch_size: Batch size handed to the data loader.
            epoch: Selects which ``pytorch_model_ep<epoch>.bin`` is loaded.
            max_seq_length: Maximum sequence length used for feature
                conversion.
            local_rank: Passed through to ``NerProcessor.make_data_loader``.
            no_cuda: Passed to ``get_device`` to select the device.
        """
        self._batch_size = batch_size
        self._local_rank = local_rank
        self._max_seq_length = max_seq_length

        self._device, self._n_gpu = get_device(no_cuda=no_cuda)

        # Use a context manager so the config file handle is closed promptly.
        config_path = os.path.join(model_dir, "model_config.json")
        with open(config_path, "r") as config_file:
            self._model_config = json.load(config_file)

        # label -> id as stored on disk, and the inverse id -> label mapping.
        self._label_to_id = self._model_config['label_map']
        self._label_map = {v: k for k, v in self._label_to_id.items()}

        self._bert_tokenizer = BertTokenizer.from_pretrained(
            model_dir, do_lower_case=self._model_config['do_lower'])

        output_config_file = os.path.join(model_dir, CONFIG_NAME)
        output_model_file = os.path.join(
            model_dir, "pytorch_model_ep{}.bin".format(epoch))

        config = BertConfig(output_config_file)
        self._model = BertForTokenClassification(
            config, num_labels=len(self._label_map))

        # Always deserialize onto the CPU: load_state_dict copies the values
        # into the (CPU-resident) parameters anyway, and this lets a
        # checkpoint saved on a GPU be loaded on a machine without CUDA.
        # The model is moved to the target device right afterwards.
        self._model.load_state_dict(
            torch.load(output_model_file, map_location='cpu'))

        self._model.to(self._device)
        self._model.eval()

    def classify_text(self, sentences):
        """Predict labels for ``sentences``.

        Args:
            sentences: Sized collection of sentences in the format expected
                by ``NerProcessor.create_examples``.

        Returns:
            A list of ``(tokens, prediction)`` tuples. Features that share a
            ``guid`` are merged into one entry, so normally there is one
            entry per input sentence. An empty input yields ``[]``.
        """
        # Nothing to tag: skip building an empty data loader.
        if len(sentences) == 0:
            return []

        examples = NerProcessor.create_examples(sentences, 'test')

        features = [
            fe
            for ex in examples
            for fe in convert_examples_to_features(
                ex, self._label_to_id, self._max_seq_length,
                self._bert_tokenizer)
        ]

        data_loader = NerProcessor.make_data_loader(
            None, self._batch_size, self._local_rank, self._label_to_id,
            self._max_seq_length, self._bert_tokenizer,
            features=features, sequential=True)

        prediction_tmp = model_predict(
            data_loader, self._device, self._label_map, self._model)

        # Internal invariant: exactly one prediction per feature.
        assert len(prediction_tmp) == len(features)

        prediction = []
        prev_guid = None
        for fe, pr in zip(features, prediction_tmp):
            # longer sentences might have been processed in several steps
            # therefore we have to glue them together. This can be done on
            # the basis of the guid.
            # tokens[1:-1] drops the first and last token of each feature.
            if prev_guid != fe.guid:
                prediction.append((fe.tokens[1:-1], pr))
            else:
                prediction[-1] = (prediction[-1][0] + fe.tokens[1:-1],
                                  prediction[-1][1] + pr)
            prev_guid = fe.guid

        # Best-effort diagnostic only: a mismatch is reported, not raised.
        # A plain comparison (instead of try/assert/except) keeps the report
        # working when Python runs with -O and asserts are stripped.
        if len(sentences) != len(prediction):
            print('Sentences:\n')
            print(sentences)
            print('\n\nPrediciton:\n')
            print(prediction)

        return prediction
def _collect_eval_tags(mask_row, true_row, pred_row, label_map):
    """Decode one evaluation row into aligned gold / predicted tag lists.

    Position 0 is skipped and decoding stops at the first position whose
    mask is 0. Positions whose gold label is "X" are ignored. The last
    collected pair is dropped, as the original loop did on reaching padding
    (presumably the closing [SEP] token - confirm against
    convert_examples_to_features).
    """
    gold_tags = []
    pred_tags = []
    for pos in range(1, len(mask_row)):
        if not mask_row[pos]:
            break
        gold = label_map[int(true_row[pos])]
        if gold != "X":
            gold_tags.append(gold)
            # label_map starts at id 1, so a predicted id of 0 has no entry;
            # fall back to "O" instead of raising KeyError.
            pred_tags.append(label_map.get(int(pred_row[pos]), "O"))
    # Drop the trailing entry for every row, including rows that fill the
    # whole sequence length and therefore contain no padding. Guarded so an
    # empty row cannot raise IndexError.
    if gold_tags:
        gold_tags.pop()
        pred_tags.pop()
    return gold_tags, pred_tags


def train_and_evaluate(OUTPUT_DIR, do_train=True, do_eval=True):
    """Train and evaluate a BERT NER Model.

    Args:
        OUTPUT_DIR: Directory the model weights (``WEIGHTS_NAME``), BERT
            config (``CONFIG_NAME``), ``model_config.json`` and
            ``eval_results.txt`` are written to. When ``do_train`` is false
            the weights and config are read from it instead.
        do_train: Fine-tune "bert-base-uncased" on ``AGE/train.txt``.
        do_eval: Evaluate on ``AGE/valid.txt`` and write a classification
            report.

    Raises:
        ValueError: If ``do_train`` is set and ``OUTPUT_DIR`` already exists
            and is not empty.
    """
    BATCH_SIZE = 32
    LEARNING_RATE = 2e-5
    NUM_TRAIN_EPOCHS = 5.0
    # in this steps lr will be low and training will be slow
    WARMUP_PROPORTION = 0.1

    if os.path.exists(OUTPUT_DIR) and os.listdir(OUTPUT_DIR) and do_train:
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                OUTPUT_DIR))
    if not os.path.exists(OUTPUT_DIR):
        os.makedirs(OUTPUT_DIR)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased",
                                              do_lower_case=True)

    def to_dataset(features):
        """Stack the per-feature id lists into a TensorDataset."""
        return TensorDataset(
            torch.tensor([f.input_ids for f in features], dtype=torch.long),
            torch.tensor([f.input_mask for f in features], dtype=torch.long),
            torch.tensor([f.segment_ids for f in features],
                         dtype=torch.long),
            torch.tensor([f.label_id for f in features], dtype=torch.long))

    if do_train:
        train_examples, num_train_examples = create_datasets("AGE/train.txt")
        num_train_steps = int(
            math.ceil(num_train_examples / BATCH_SIZE * NUM_TRAIN_EPOCHS))

        model = BertForTokenClassification.from_pretrained(
            "bert-base-uncased", num_labels=num_labels)
        model.to(device)

        # Biases and LayerNorm parameters are excluded from weight decay.
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay': 0.01
        }, {
            'params': [
                p for n, p in param_optimizer
                if any(nd in n for nd in no_decay)
            ],
            'weight_decay': 0.0
        }]
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=LEARNING_RATE,
                             warmup=WARMUP_PROPORTION,
                             t_total=num_train_steps)

        train_features = convert_examples_to_features(
            train_examples, label_list, MAX_SEQ_LENGTH, tokenizer)

        logger.info("***** Running training *****")
        logger.info(" Num examples = %d", num_train_examples)
        logger.info(" Batch size = %d", BATCH_SIZE)
        logger.info(" Num steps = %d", num_train_steps)

        train_data = to_dataset(train_features)
        train_sampler = RandomSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=BATCH_SIZE)

        model.train()
        for _ in trange(int(NUM_TRAIN_EPOCHS), desc="Epoch"):
            tr_loss = 0
            for batch in tqdm(train_dataloader, desc="Iteration"):
                input_ids, input_mask, segment_ids, label_ids = tuple(
                    t.to(device) for t in batch)
                loss = model(input_ids, segment_ids, input_mask, label_ids)
                loss.backward()
                tr_loss += loss.item()
                optimizer.step()
                optimizer.zero_grad()
            # Summed loss of the epoch that just finished.
            print(tr_loss)

        # Save a trained model and the associated configuration
        # Only save the model it-self (unwrap e.g. a DataParallel wrapper).
        model_to_save = model.module if hasattr(model, 'module') else model
        output_model_file = os.path.join(OUTPUT_DIR, WEIGHTS_NAME)
        torch.save(model_to_save.state_dict(), output_model_file)
        output_config_file = os.path.join(OUTPUT_DIR, CONFIG_NAME)
        with open(output_config_file, 'w') as f:
            f.write(model_to_save.config.to_json_string())

        label_map = {i: label for i, label in enumerate(label_list, 1)}
        model_config = {
            "bert_model": "bert-base-uncased",
            "do_lower": True,
            "max_seq_length": MAX_SEQ_LENGTH,
            "num_labels": len(label_list) + 1,
            "label_map": label_map
        }
        # Close the handle explicitly so the JSON is flushed to disk.
        with open(os.path.join(OUTPUT_DIR, "model_config.json"), "w") as f:
            json.dump(model_config, f)
    else:
        output_config_file = os.path.join(OUTPUT_DIR, CONFIG_NAME)
        output_model_file = os.path.join(OUTPUT_DIR, WEIGHTS_NAME)
        config = BertConfig(output_config_file)
        model = BertForTokenClassification(config, num_labels=num_labels)
        # Deserialize onto the CPU so a GPU-trained checkpoint also loads on
        # a CPU-only machine; the model is moved to `device` afterwards.
        model.load_state_dict(
            torch.load(output_model_file, map_location='cpu'))
        model.to(device)

    if do_eval:
        EVAL_BATCH_SIZE = 32
        eval_examples, num_eval_examples = create_datasets("AGE/valid.txt")
        eval_features = convert_examples_to_features(
            eval_examples, label_list, MAX_SEQ_LENGTH, tokenizer)

        logger.info("***** Running evaluation *****")
        logger.info(" Num examples = %d", num_eval_examples)
        logger.info(" Batch size = %d", EVAL_BATCH_SIZE)

        eval_data = to_dataset(eval_features)
        # Run prediction for full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data,
                                     sampler=eval_sampler,
                                     batch_size=EVAL_BATCH_SIZE)
        model.eval()

        y_true = []
        y_pred = []
        label_map = {i: label for i, label in enumerate(label_list, 1)}
        for input_ids, input_mask, segment_ids, label_ids in tqdm(
                eval_dataloader, desc="Evaluating"):
            with torch.no_grad():
                logits = model(input_ids.to(device), segment_ids.to(device),
                               input_mask.to(device))

            # argmax over raw logits selects the same class as argmax over
            # log_softmax, because log_softmax is monotonic.
            pred_ids = torch.argmax(logits, dim=2).detach().cpu().numpy()
            true_ids = label_ids.numpy()
            mask_rows = input_mask.numpy()

            for mask_row, true_row, pred_row in zip(mask_rows, true_ids,
                                                    pred_ids):
                gold_tags, pred_tags = _collect_eval_tags(
                    mask_row, true_row, pred_row, label_map)
                y_true.append(gold_tags)
                y_pred.append(pred_tags)

        report = classification_report(y_true, y_pred)
        output_eval_file = os.path.join(OUTPUT_DIR, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            logger.info("\n%s", report)
            writer.write(report)
def _collect_pred_tags(mask_row, pred_row, label_map):
    """Decode one row of predicted label ids into a list of tag strings.

    Position 0 is skipped and decoding stops at the first position whose
    mask is 0. Predictions that decode to "X" are ignored. The last
    collected tag is dropped, as the original loop did on reaching padding
    (presumably the closing [SEP] token - confirm against
    convert_examples_to_features).
    """
    tags = []
    for pos in range(1, len(mask_row)):
        if not mask_row[pos]:
            break
        # label_map was loaded from JSON, so its keys are strings. An id
        # without an entry falls back to "O" instead of raising KeyError.
        tag = label_map.get(str(int(pred_row[pos])), "O")
        if tag != "X":
            tags.append(tag)
    # Drop the trailing entry for every row, including rows that fill the
    # whole sequence length; guarded so an empty row cannot raise IndexError.
    if tags:
        tags.pop()
    return tags


def predict(OUTPUT_DIR, in_sentences):
    """Tag sentences with a previously trained BERT NER model.

    Args:
        OUTPUT_DIR: Directory containing the trained model: the weights
            (``WEIGHTS_NAME``), the BERT config (``CONFIG_NAME``) and
            ``model_config.json``.
        in_sentences: List of sentences on which tagging has to be
            performed.

    Returns:
        A list of ``(sentence, tags)`` tuples in input order. An empty input
        returns ``[]`` without loading the model.
    """
    PRED_BATCH_SIZE = 64

    # Materialize once: the input is traversed twice (examples + final zip).
    in_sentences = list(in_sentences)
    if not in_sentences:
        return []

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Use a context manager so the config file handle is closed promptly.
    with open(os.path.join(OUTPUT_DIR, "model_config.json")) as config_file:
        model_config = json.load(config_file)

    output_config_file = os.path.join(OUTPUT_DIR, CONFIG_NAME)
    output_model_file = os.path.join(OUTPUT_DIR, WEIGHTS_NAME)
    config = BertConfig(output_config_file)
    model = BertForTokenClassification(
        config, num_labels=model_config["num_labels"])
    # Deserialize onto the CPU so a GPU-trained checkpoint also loads on a
    # CPU-only machine; the model is moved to `device` afterwards.
    model.load_state_dict(torch.load(output_model_file, map_location='cpu'))
    model.to(device)

    tokenizer = BertTokenizer.from_pretrained(
        model_config["bert_model"], do_lower_case=model_config["do_lower"])

    # Dummy "O" labels, one per space-separated word, so the shared feature
    # converter can be reused for unlabeled text.
    in_examples = [
        InputExample(guid="",
                     text_a=x,
                     text_b=None,
                     label=["O"] * len(x.split(" "))) for x in in_sentences
    ]
    in_features = convert_examples_to_features(in_examples, label_list,
                                               MAX_SEQ_LENGTH, tokenizer)

    all_input_ids = torch.tensor([f.input_ids for f in in_features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in in_features],
                                  dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in in_features],
                                   dtype=torch.long)

    pred_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids)

    # Run prediction for full data
    pred_sampler = SequentialSampler(pred_data)
    pred_dataloader = DataLoader(pred_data,
                                 sampler=pred_sampler,
                                 batch_size=PRED_BATCH_SIZE,
                                 drop_last=False)
    model.eval()

    preds = []
    label_map = model_config["label_map"]
    for input_ids, input_mask, segment_ids in tqdm(pred_dataloader,
                                                   desc="Predicting"):
        with torch.no_grad():
            logits = model(input_ids.to(device), segment_ids.to(device),
                           input_mask.to(device))

        # argmax over raw logits selects the same class as argmax over
        # log_softmax, because log_softmax is monotonic.
        pred_ids = torch.argmax(logits, dim=2).detach().cpu().numpy()
        # Decode from a CPU array instead of iterating the tensor
        # element by element.
        mask_rows = input_mask.numpy()

        preds.extend(
            _collect_pred_tags(mask_row, pred_row, label_map)
            for mask_row, pred_row in zip(mask_rows, pred_ids))

    return list(zip(in_sentences, preds))
def model_eval(batch_size, label_map, processor, device, num_train_epochs=1,
               output_dir=None, model=None, local_rank=-1, no_cuda=False,
               dry_run=False):
    """Evaluate one model, or one checkpoint per epoch, on the dev set.

    When ``output_dir`` is given, the checkpoint
    ``pytorch_model_ep<ep>.bin`` is loaded for each epoch ``ep`` and
    evaluated; iteration stops at the first missing checkpoint. Otherwise
    the supplied ``model`` is evaluated once per epoch iteration.

    Args:
        batch_size: Batch size passed to ``processor.get_dev_examples``.
        label_map: Label mapping; its length is used as ``num_labels`` and
            it is passed on to ``model_predict_compare``.
        processor: Object providing ``get_dev_examples`` and
            ``get_evaluation_file``.
        device: Device the model is moved to.
        num_train_epochs: Number of epochs (checkpoints) to iterate over.
        output_dir: Directory with the checkpoints and config; also the
            location the accumulated results are pickled to.
        model: Model to evaluate when ``output_dir`` is None.
        local_rank: Passed to ``processor.get_dev_examples``.
        no_cuda: Kept for backward compatibility. Checkpoints are always
            deserialized onto the CPU and then moved to ``device``.
        dry_run: Stop after the first epoch; also forwarded to
            ``model_predict_compare``.

    Returns:
        A pandas DataFrame indexed by ``(index, epoch)`` with the evaluation
        and statistics columns for ALL / LOC / PER / ORG. It is empty when
        no epoch could be evaluated.

    Raises:
        ValueError: If neither ``output_dir`` nor ``model`` is given.
    """
    slot_names = ('LOC', 'PER', 'ORG')

    output_eval_file = None
    if output_dir is not None:
        output_eval_file = os.path.join(output_dir,
                                        processor.get_evaluation_file())
        logger.info('Write evaluation results to: {}'.format(output_eval_file))

    dataloader = processor.get_dev_examples(batch_size, local_rank)

    logger.info("***** Running evaluation *****")
    # NOTE(review): len(dataloader) may be the number of batches rather than
    # the number of examples - confirm against get_dev_examples.
    logger.info(" Num examples = %d", len(dataloader))
    logger.info(" Batch size = %d", batch_size)

    results = []

    output_config_file = None
    if output_dir is not None:
        output_config_file = os.path.join(output_dir, CONFIG_NAME)

    def section_frame(res, section, **concat_kwargs):
        """Build a frame with one row per slot (ALL first) for `section`."""
        frames = [
            pd.DataFrame.from_dict(res['overall'][section],
                                   orient='index', columns=['ALL'])
        ]
        frames.extend(
            pd.DataFrame.from_dict(res['slots'][slot][section],
                                   orient='index', columns=[slot])
            for slot in slot_names)
        return pd.concat(frames, axis=1, **concat_kwargs).T

    for ep in trange(1, int(num_train_epochs) + 1, desc="Epoch"):

        if dry_run and ep > 1:
            logger.info("Dry run. Stop.")
            break

        if output_config_file is not None:
            # Load a trained model and config that you have fine-tuned
            output_model_file = os.path.join(
                output_dir, "pytorch_model_ep{}.bin".format(ep))

            if not os.path.exists(output_model_file):
                logger.info(
                    "Stopping at epoch {} since model file is missing.".format(
                        ep))
                break

            config = BertConfig(output_config_file)
            model = BertForTokenClassification(config,
                                               num_labels=len(label_map))
            # Deserialize onto the CPU so a GPU-trained checkpoint also
            # loads on a CPU-only machine; moved to `device` right below.
            model.load_state_dict(
                torch.load(output_model_file, map_location='cpu'))
            model.to(device)

        if model is None:
            raise ValueError('Model required for evaluation.')

        model.eval()

        y_pred, y_true = model_predict_compare(dataloader, device, label_map,
                                               model, dry_run)

        # One line per token in the layout handed to conll_eval:
        # "<word> <pos> <gold> <predicted>", with placeholder word and pos.
        lines = [
            'empty ' + 'XXX ' + v + ' ' + p
            for yt, yp in zip(y_true, y_pred)
            for v, p in zip(yt, yp)
        ]

        res = conll_eval(lines)

        evals = section_frame(res, 'evals')
        stats = section_frame(res, 'stats', sort=True)

        evals['epoch'] = ep
        stats['epoch'] = ep

        results.append(
            pd.concat([
                evals.reset_index().set_index(['index', 'epoch']),
                stats.reset_index().set_index(['index', 'epoch'])
            ], axis=1))

        # Persist after every epoch so partial results survive an abort.
        if output_eval_file is not None:
            pd.concat(results).to_pickle(output_eval_file)

    # pd.concat raises ValueError on an empty list (e.g. when the first
    # checkpoint is missing), so return an empty frame in that case.
    results = pd.concat(results) if results else pd.DataFrame()
    print(results)

    return results