def __init__(self, model_weight_filename=None):
    """Load an instance of BERT model for dimension classification."""
    self.num_labels = len(DimensionDataset.label2idx)
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    logging.info('*** Instantiate model ***')
    if model_weight_filename:
        config = BertConfig(vocab_size_or_config_json_file=30522,
                            hidden_size=768,
                            num_hidden_layers=12,
                            num_attention_heads=12,
                            intermediate_size=3072)
        self.model = BertForTokenClassification(config, self.num_labels)
        logging.info('*** Loading model weights ***')
        self.model.load_state_dict(
            torch.load(model_weight_filename, map_location=self.device))
    else:
        # load bert pretrained with empty token classification top layers
        self.model = BertForTokenClassification.from_pretrained(
            "bert-base-uncased", num_labels=self.num_labels)
    logging.info('*** Loading tokenizer ***')
    self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
def load_pretrained_model_tokenizer(model_type="BertForSequenceClassification",
                                    base_model=None, base_tokenizer=None,
                                    device="cuda", chinese=False, num_labels=2):
    # Load pre-trained model (weights)
    if base_model is None:
        base_model = "bert-base-chinese" if chinese else "bert-base-uncased"
    if model_type == "BertForSequenceClassification":
        model = BertForSequenceClassification.from_pretrained(base_model,
                                                              num_labels=num_labels)
    elif model_type == "BertForNextSentencePrediction":
        model = BertForNextSentencePrediction.from_pretrained(base_model)
    elif model_type == "BertForTokenClassification":
        model = BertForTokenClassification.from_pretrained(base_model,
                                                           num_labels=num_labels)
    elif model_type == "BertMSE":
        model = BertMSE()
    else:
        print("[Error]: unsupported model type")
        return None, None
    # Load pre-trained model tokenizer (vocabulary)
    if base_tokenizer is None:
        # Download from huggingface
        tokenizer = BertTokenizer.from_pretrained(base_model)
    else:
        # Load local file
        tokenizer = BertTokenizer.from_pretrained(base_tokenizer)
    model.to(device)
    return model, tokenizer
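# NOTE (illustrative, not from the original source): a minimal sketch of calling
# load_pretrained_model_tokenizer() above for token classification on CPU; the
# num_labels value is a placeholder.
model, tokenizer = load_pretrained_model_tokenizer(
    model_type="BertForTokenClassification", device="cpu", num_labels=9)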
def predict(name, lang='eng', path='learn', model_dir='models'):
    path, model_dir = Path(path), Path(model_dir)
    print('Loading model...')
    device = 'cpu'
    state = torch.load(path / model_dir / f'{name}.pth', map_location=device)
    bert_model = 'bert-base-cased' if lang == 'eng' else 'bert-base-multilingual-cased'
    print(f'Lang: {lang}\nModel: {bert_model}\nRun: {name}')
    model = BertForTokenClassification.from_pretrained(bert_model,
                                                       num_labels=len(VOCAB),
                                                       cache_dir='bertm')
    model.load_state_dict(state['model'], strict=True)
    print('Done')
    try:
        while True:
            # get sentence
            sent = input('Enter sentence: ')
            words = sent.split()
            x, mask = to_feature(words, bert_model)
            with torch.no_grad():
                # predict named entities
                out = model(x)
            pred = out.argmax(-1).view(-1)
            print(pred)
            active_pred = pred[mask == 1]
            print('Named Entities')
            active_pred = active_pred.tolist()
            # skip the [CLS]/[SEP] positions when aligning predictions with words
            for w, l in zip(words, active_pred[1:-1]):
                print(f'{w} {idx2label[l]}')
    except (KeyboardInterrupt, EOFError):
        # exit the interactive loop cleanly instead of swallowing every exception
        print('See ya')
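# NOTE (illustrative assumption): the interactive predictor above could be launched
# like this, assuming a fine-tuned checkpoint saved as learn/models/conll_run.pth.
predict('conll_run', lang='eng', path='learn', model_dir='models')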
def test(config):
    print('-' * 50)
    print('Loading core model......')
    load_model_name = config['test_model']
    if not os.path.exists(config['test_model']):
        print('The test model ' + config['test_model'] +
              ' does not exist, please check it in config.txt')
    print('Core model name is: ' + load_model_name)
    # load the pretrained backbone (one extra label is reserved for padding)
    model = BertForTokenClassification.from_pretrained(
        config['model_name'], num_labels=config['tagset_size'] + 1)
    print('-' * 50)
    print('Deploying the test data......')
    config['shuffle'] = False
    test_sents, test_data_loader = prepare_data(config)
    print('Test data loaded done!')
    model.to(config['device'])
    checkpoint = torch.load(load_model_name)
    model.load_state_dict(checkpoint['net'])
    model.eval()
    print('-' * 50)
    print('Core model loaded done! Start predicting......')
    with torch.no_grad():
        results = predict(model, test_data_loader, config)
    results = restore_result(test_sents, results)
    if config['output_file']:
        write(results, config['output_file'])
    else:
        print('The output file is not specified. Please check config.txt')
        sys.exit(1)
    print('Test process done!')
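# NOTE (illustrative assumption): the config keys that test() actually reads; the
# values below are placeholders, and keys consumed only by prepare_data() are omitted.
config = {
    'test_model': 'checkpoints/best.pt',   # fine-tuned checkpoint holding a 'net' state dict
    'model_name': 'bert-base-chinese',     # backbone passed to from_pretrained()
    'tagset_size': 7,                      # number of real tags; one padding label is added
    'device': 'cuda',
    'output_file': 'predictions.txt',
}
test(config)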
def __init__(self, num_labels):
    super(NER1, self).__init__()
    self.bert = BertForTokenClassification.from_pretrained('bert-base-chinese',
                                                           num_labels=num_labels)
    for param in self.bert.parameters():
        param.requires_grad = True
    self.dp1 = nn.Dropout(0.1)
def __init__(self, args, params, device):
    super(Net, self).__init__()
    self.bert = BertForTokenClassification.from_pretrained(
        args.bert_model_dir, num_labels=len(params.tag2idx))
    self.bilstm = nn.LSTM(bidirectional=True,
                          num_layers=2,
                          input_size=768,
                          hidden_size=768 // 2,
                          batch_first=True)
    self.fc = nn.Linear(768, len(params.tag2idx))
    self.num_labels = 2
    self.device = device
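# NOTE (sketch under assumptions): no forward() is shown for this module. One plausible
# wiring of the layers defined above is to take the BERT encoder's hidden states, pass
# them through the BiLSTM, and project to tag logits with the linear layer. This is not
# the original author's implementation and assumes a transformers-style return value.
def forward(self, input_ids, attention_mask=None):
    sequence_output = self.bert.bert(input_ids, attention_mask=attention_mask)[0]  # (B, T, 768)
    lstm_out, _ = self.bilstm(sequence_output)   # (B, T, 768)
    logits = self.fc(lstm_out)                   # (B, T, num_tags)
    return logits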
def main():
    slot2Id = getSlot2Id()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = BertForTokenClassification.from_pretrained(
        config.pretrained_model_name_or_path, num_labels=len(slot2Id))
    model.to(device)
    x, y = processData()
    train_dataloader, val_dataloader = getDataLoader(x, y)
    train(model, device, train_dataloader, val_dataloader, config.epochs,
          config.max_grad_norm)
def __init__(self, dir_path, max_seq_length=30):
    self.max_seq_length = max_seq_length
    self.processor = NerProcessor.load(os.path.join(dir_path, PROCESSOR_NAME))
    self.tokenizer = BertTokenizer.from_pretrained(dir_path)
    self.classifier = BertForTokenClassification.from_pretrained(
        dir_path, len(self.processor.labels))
    self.classifier.eval()
    self.id2label = {i: label for i, label in enumerate(self.processor.labels)}
    global debug_message
    debug_message = False
def __init__(self, opt):
    super(BERT_REL, self).__init__()
    self.opt = opt
    # token-level tag classification
    self.bertForToken = BertForTokenClassification.from_pretrained(
        self.opt.bert_model_dir, num_labels=self.opt.tag_nums)
    self.num_labels = self.opt.tag_nums
    # relation classification
    self.rel_bert = BertModel.from_pretrained(self.opt.bert_model_dir)
    self.rel_fc = nn.Sequential(nn.Linear(768, 1024),
                                nn.ReLU(),
                                nn.Linear(1024, self.opt.rel_nums))
    self.id2tag = json.loads(open(opt.id2tag_dir, 'r').readline())
    self.type2types = json.loads(open(opt.type2types_dir, 'r').readline())
    self.sep1 = torch.LongTensor([1]).to("cuda")
    self.sep2 = torch.LongTensor([2]).to("cuda")
    self.init_weights()
def BuildModel(config, weight=None):
    # change the forward method: do not consider 'X' when computing loss
    def new_forward(self, input_ids, token_type_ids=None, attention_mask=None,
                    add_masks=None, labels=None, weight=weight):
        sequence_output, _ = self.bert(input_ids, token_type_ids, attention_mask,
                                       output_all_encoded_layers=False)
        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)
        if labels is not None:
            if weight is not None:
                weight = weight.to(torch.float).to(config['device'])
            loss_fct = nn.CrossEntropyLoss(weight=weight,
                                           ignore_index=self.num_labels - 1)
            # Only keep active parts of the loss
            if attention_mask is not None or add_masks is not None:
                if add_masks is None:
                    add_masks = 1
                if attention_mask is None:
                    attention_mask = 1
                active_loss = (attention_mask.view(-1) == 1) * (add_masks.view(-1) == 1)
                active_logits = logits.view(-1, self.num_labels)[active_loss]
                active_labels = labels.view(-1)[active_loss]
                loss = loss_fct(active_logits, active_labels)
            else:
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            return loss
        else:
            return logits

    BertForTokenClassification.forward = new_forward
    model = BertForTokenClassification.from_pretrained(config['name'],
                                                       num_labels=config['num_labels'])
    model.to(config['device'])
    return model
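# NOTE (illustrative assumption): BuildModel() reads only these config keys; the values
# are placeholders, and the optional class weights must match num_labels in length.
config = {'name': 'bert-base-cased', 'num_labels': 10, 'device': 'cpu'}
model = BuildModel(config, weight=torch.ones(10))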
def __init__(self, opt):
    super(BERT_CNN_CRF, self).__init__()
    self.opt = opt
    # token-level tag classification
    self.bertForToken = BertForTokenClassification.from_pretrained(
        self.opt.bert_model_dir, num_labels=self.opt.tag_nums)
    self.num_labels = self.opt.tag_nums
    self.crf = CRF(self.opt.tag_nums, batch_first=True)
    # relation classification
    self.type_emb = nn.Embedding(3, self.opt.bert_hidden_size)
    self.rel_cnns = Encoder(enc_method='cnn',
                            filters_num=self.opt.filter_num,
                            filters=self.opt.filters,
                            f_dim=self.opt.bert_hidden_size)
    self.classifier_rels = nn.Linear(len(self.opt.filters) * self.opt.filter_num,
                                     self.opt.rel_nums)
    self.id2tag = json.loads(open(opt.id2tag_dir, 'r').readline())
    self.type2types = json.loads(open(opt.type2types_dir, 'r').readline())
    self.init_weights()
def keywordextract(sentence, model_path='./pretrained/keyword_extraction_pretrained.pt'):
    """Return a single keyword extracted from the given sentence."""
    device = torch.device('cpu')
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
    model = BertForTokenClassification.from_pretrained("bert-base-uncased", num_labels=3)
    model.to(device)
    text = sentence
    tkns = tokenizer.tokenize(text)
    indexed_tokens = tokenizer.convert_tokens_to_ids(tkns)
    segments_ids = [0] * len(tkns)
    tokens_tensor = torch.tensor([indexed_tokens]).to(device)
    segments_tensors = torch.tensor([segments_ids]).to(device)
    # the freshly initialised model above is replaced by the fine-tuned checkpoint
    model = torch.load(model_path, map_location=device)
    model.eval()
    prediction = []
    logit = model(tokens_tensor, token_type_ids=None,
                  attention_mask=segments_tensors)
    logit = logit.detach().cpu().numpy()
    prediction.extend([list(p) for p in np.argmax(logit, axis=2)])
    keyword = None
    for k, j in enumerate(prediction[0]):
        if j == 1 or j == 0:  # token tagged as part of a keyword
            keyword = tokenizer.convert_ids_to_tokens(
                tokens_tensor[0].to('cpu').numpy())[k]
    if keyword is not None and "#" in keyword:
        keyword = keyword.replace("#", "")
    # map a word-piece back to the full (lower-cased) word it came from
    for word in sentence.split():
        if keyword is not None and keyword in word:
            keyword = word.lower()
    return keyword
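# NOTE (illustrative): calling the extractor with its default checkpoint path, which is
# assumed to exist on disk.
print(keywordextract("Deep learning models often need large annotated corpora."))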
def train():
    model_path = dir_path + '/models/'
    print('your model would be saved at', model_path)
    model = BertForTokenClassification.from_pretrained(
        "bert-base-multilingual-cased", num_labels=len(bert_io.tag2idx))
    model.to(device)

    trn_data = bert_io.convert_to_bert_input(trn)
    sampler = RandomSampler(trn_data)
    trn_dataloader = DataLoader(trn_data, sampler=sampler, batch_size=batch_size)

    # load optimizer
    FULL_FINETUNING = True
    if FULL_FINETUNING:
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'gamma', 'beta']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer
                        if not any(nd in n for nd in no_decay)],
             'weight_decay_rate': 0.01},
            {'params': [p for n, p in param_optimizer
                        if any(nd in n for nd in no_decay)],
             'weight_decay_rate': 0.0}
        ]
    else:
        param_optimizer = list(model.classifier.named_parameters())
        optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]
    optimizer = Adam(optimizer_grouped_parameters, lr=3e-5)

    # train
    epochs = 10
    max_grad_norm = 1.0
    num_of_epoch = 0
    for _ in trange(epochs, desc="Epoch"):
        # TRAIN loop
        model.train()
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        for step, batch in enumerate(trn_dataloader):
            # add batch to gpu
            batch = tuple(t.to(device) for t in batch)
            b_input_ids, b_input_orig_tok_to_maps, b_input_args, b_input_masks = batch
            # forward pass
            loss = model(b_input_ids, token_type_ids=None,
                         attention_mask=b_input_masks, labels=b_input_args)
            # backward pass
            loss.backward()
            # track train loss
            tr_loss += loss.item()
            nb_tr_examples += b_input_ids.size(0)
            nb_tr_steps += 1
            # gradient clipping
            torch.nn.utils.clip_grad_norm_(parameters=model.parameters(),
                                           max_norm=max_grad_norm)
            # update parameters
            optimizer.step()
            model.zero_grad()
        # print train loss per epoch
        print("Train loss: {}".format(tr_loss / nb_tr_steps))
        model_saved_path = model_path + 'ko-srl-epoch-' + str(num_of_epoch) + '.pt'
        torch.save(model, model_saved_path)
        num_of_epoch += 1
    print('...training is done')
def main(): """Training pipeline""" device = torch.device("cuda" if torch.cuda.is_available() else "cpu") n_gpu = torch.cuda.device_count() data = pd.read_csv("train_m.txt", sep='\t', encoding="latin1").fillna(method="ffill") getter = SentenceGetter(data) sentences = [" ".join([s[0] for s in sent]) for sent in getter.sentences] labels = [[s[1] for s in sent] for sent in getter.sentences] tags_vals = list(set(data["tag"].values)) tag2idx = {t: i for i, t in enumerate(tags_vals)} tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True) tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences] input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts], maxlen=MAX_LEN, dtype="long", truncating="post", padding="post") tags = pad_sequences([[tag2idx.get(l) for l in lab] for lab in labels], maxlen=MAX_LEN, value=tag2idx["O"], padding="post", dtype="long", truncating="post") attention_masks = [[float(i>0) for i in ii] for ii in input_ids] tr_inputs, val_inputs, tr_tags, val_tags = train_test_split(input_ids, tags, random_state=2018, test_size=0.1) tr_masks, val_masks, _, _ = train_test_split(attention_masks, input_ids, random_state=2018, test_size=0.1) tr_inputs = torch.tensor(tr_inputs) val_inputs = torch.tensor(val_inputs) tr_tags = torch.tensor(tr_tags) val_tags = torch.tensor(val_tags) tr_masks = torch.tensor(tr_masks) val_masks = torch.tensor(val_masks) train_data = TensorDataset(tr_inputs, tr_masks, tr_tags) train_sampler = RandomSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=bs) valid_data = TensorDataset(val_inputs, val_masks, val_tags) valid_sampler = SequentialSampler(valid_data) valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=bs) model = BertForTokenClassification.from_pretrained("bert-base-uncased", num_labels=len(tag2idx)) model.cuda(); FULL_FINETUNING = True if FULL_FINETUNING: param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'gamma', 'beta'] optimizer_grouped_parameters = [ {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.01}, {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.0} ] else: param_optimizer = list(model.classifier.named_parameters()) optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}] optimizer = Adam(optimizer_grouped_parameters, lr=3e-5) epochs = 5 max_grad_norm = 1.0 for _ in trange(epochs, desc="Epoch"): # TRAIN loop model.train() tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate(train_dataloader): # add batch to gpu batch = tuple(t.to(device) for t in batch) b_input_ids, b_input_mask, b_labels = batch # forward pass loss = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels) # backward pass loss.backward() # track train loss tr_loss += loss.item() nb_tr_examples += b_input_ids.size(0) nb_tr_steps += 1 # gradient clipping torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm) # update parameters optimizer.step() model.zero_grad() # print train loss per epoch print("Train loss: {}".format(tr_loss/nb_tr_steps)) # VALIDATION on validation set model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 predictions , true_labels = [], [] for batch in valid_dataloader: batch = tuple(t.to(device) for t in batch) b_input_ids, b_input_mask, b_labels = batch with 
torch.no_grad(): tmp_eval_loss = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels) logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask) logits = logits.detach().cpu().numpy() label_ids = b_labels.to('cpu').numpy() predictions.extend([list(p) for p in np.argmax(logits, axis=2)]) true_labels.append(label_ids) tmp_eval_accuracy = flat_accuracy(logits, label_ids) eval_loss += tmp_eval_loss.mean().item() eval_accuracy += tmp_eval_accuracy nb_eval_examples += b_input_ids.size(0) nb_eval_steps += 1 eval_loss = eval_loss/nb_eval_steps print("Validation loss: {}".format(eval_loss)) print("Validation Accuracy: {}".format(eval_accuracy/nb_eval_steps)) pred_tags = [tags_vals[p_i] for p in predictions for p_i in p] valid_tags = [tags_vals[l_ii] for l in true_labels for l_i in l for l_ii in l_i] print("F1-Score: {}".format(f1_score(pred_tags, valid_tags))) print("Precision-Score: {}".format(precision_score(pred_tags, valid_tags))) print("Recall-Score: {}".format(recall_score(pred_tags, valid_tags))) print(classification_report(pred_tags, valid_tags)) model.eval() predictions = [] true_labels = [] eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 for batch in valid_dataloader: batch = tuple(t.to(device) for t in batch) b_input_ids, b_input_mask, b_labels = batch with torch.no_grad(): tmp_eval_loss = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels) logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask) logits = logits.detach().cpu().numpy() predictions.extend([list(p) for p in np.argmax(logits, axis=2)]) label_ids = b_labels.to('cpu').numpy() true_labels.append(label_ids) tmp_eval_accuracy = flat_accuracy(logits, label_ids) eval_loss += tmp_eval_loss.mean().item() eval_accuracy += tmp_eval_accuracy nb_eval_examples += b_input_ids.size(0) nb_eval_steps += 1 pred_tags = [tags_vals[p_i] for p in predictions for p_i in p] valid_tags = [tags_vals[l_ii] for l in true_labels for l_i in l for l_ii in l_i] print("Validation loss: {}".format(eval_loss/nb_eval_steps)) print("Validation Accuracy: {}".format(eval_accuracy/nb_eval_steps)) print("Validation F1-Score: {}".format(f1_score(pred_tags, valid_tags))) print("Precision-Score: {}".format(precision_score(pred_tags, valid_tags))) print("Recall-Score: {}".format(recall_score(pred_tags, valid_tags))) print(classification_report(pred_tags, valid_tags)) true_positives_O = 0 predicted_positives_O = 0 real_positives_O = 0 for pred, valid in zip(pred_tags, valid_tags): if pred == 'I-claim' and valid == 'I-claim': true_positives_O += 1 if pred == 'I-claim': predicted_positives_O += 1 if valid == 'I-claim': real_positives_O += 1 print("True positives I: {}".format(true_positives_O)) print("predicted positives I: {}".format(predicted_positives_O)) print("real positives I: {}".format(real_positives_O)) true_positives_B = 0 predicted_positives_B = 0 real_positives_B = 0 for pred, valid in zip(pred_tags, valid_tags): if pred == 'B-claim' and valid == 'B-claim': true_positives_B += 1 if pred == 'B-claim': predicted_positives_B += 1 if valid == 'B-claim': real_positives_B += 1 print("True positives B: {}".format(true_positives_B)) print("predicted positives B: {}".format(predicted_positives_B)) print("real positives B: {}".format(real_positives_B)) with open("resultados", 'w') as out: out.write("Predictions:\n") out.write("{}".format(list(zip(list(val_inputs), pred_tags, valid_tags))))
def run_ner( lang: str = 'eng', log_dir: str = 'logs', task: str = NER, batch_size: int = 1, epochs: int = 1, dataset: str = 'data/conll-2003/', loss: str = 'cross', max_seq_len: int = 128, do_lower_case: bool = False, warmup_proportion: float = 0.1, rand_seed: int = None, ds_size: int = None, data_bunch_path: str = 'data/conll-2003/db', tuned_learner: str = None, do_train: str = False, do_eval: str = False, save: bool = False, nameX: str = 'ner', mask: tuple = ('s', 's'), ): name = "_".join( map(str, [ nameX, task, lang, mask[0], mask[1], loss, batch_size, max_seq_len, do_train, do_eval ])) log_dir = Path(log_dir) log_dir.mkdir(parents=True, exist_ok=True) init_logger(log_dir, name) if rand_seed: random.seed(rand_seed) np.random.seed(rand_seed) torch.manual_seed(rand_seed) if torch.cuda.is_available(): torch.cuda.manual_seed_all(rand_seed) trainset = dataset + lang + '/train.txt' devset = dataset + lang + '/dev.txt' testset = dataset + lang + '/test.txt' bert_model = 'bert-base-cased' if lang == 'eng' else 'bert-base-multilingual-cased' print(f'Lang: {lang}\nModel: {bert_model}\nRun: {name}') model = BertForTokenClassification.from_pretrained(bert_model, num_labels=len(VOCAB), cache_dir='bertm') if tuned_learner: print('Loading pretrained learner: ', tuned_learner) model.bert.load_state_dict(torch.load(tuned_learner)) model = torch.nn.DataParallel(model) model_lr_group = bert_layer_list(model) layers = len(model_lr_group) kwargs = {'max_seq_len': max_seq_len, 'ds_size': ds_size, 'mask': mask} train_dl = DataLoader(dataset=NerDataset(trainset, bert_model, train=True, **kwargs), batch_size=batch_size, shuffle=True, collate_fn=partial(pad, train=True)) dev_dl = DataLoader(dataset=NerDataset(devset, bert_model, **kwargs), batch_size=batch_size, shuffle=False, collate_fn=pad) test_dl = DataLoader(dataset=NerDataset(testset, bert_model, **kwargs), batch_size=batch_size, shuffle=False, collate_fn=pad) data = DataBunch(train_dl=train_dl, valid_dl=dev_dl, test_dl=test_dl, collate_fn=pad, path=Path(data_bunch_path)) train_opt_steps = int(len(train_dl.dataset) / batch_size) * epochs optim = BertAdam(model.parameters(), lr=0.01, warmup=warmup_proportion, t_total=train_opt_steps) loss_fun = ner_loss_func if loss == 'cross' else partial(ner_loss_func, zero=True) metrics = [Conll_F1()] learn = Learner( data, model, BertAdam, loss_func=loss_fun, metrics=metrics, true_wd=False, layer_groups=model_lr_group, path='learn' + nameX, ) learn.opt = OptimWrapper(optim) lrm = 1.6 # select set of starting lrs lrs_eng = [0.01, 5e-4, 3e-4, 3e-4, 1e-5] lrs_deu = [0.01, 5e-4, 5e-4, 3e-4, 2e-5] startlr = lrs_eng if lang == 'eng' else lrs_deu results = [['epoch', 'lr', 'f1', 'val_loss', 'train_loss', 'train_losses']] if do_train: learn.freeze() learn.fit_one_cycle(1, startlr[0], moms=(0.8, 0.7)) learn.freeze_to(-3) lrs = learn.lr_range(slice(startlr[1] / (1.6**15), startlr[1])) learn.fit_one_cycle(1, lrs, moms=(0.8, 0.7)) learn.freeze_to(-6) lrs = learn.lr_range(slice(startlr[2] / (1.6**15), startlr[2])) learn.fit_one_cycle(1, lrs, moms=(0.8, 0.7)) learn.freeze_to(-12) lrs = learn.lr_range(slice(startlr[3] / (1.6**15), startlr[3])) learn.fit_one_cycle(1, lrs, moms=(0.8, 0.7)) learn.unfreeze() lrs = learn.lr_range(slice(startlr[4] / (1.6**15), startlr[4])) learn.fit_one_cycle(1, lrs, moms=(0.8, 0.7)) if do_eval: res = learn.validate(test_dl, metrics=metrics) met_res = [f'{m.__name__}: {r}' for m, r in zip(metrics, res[1:])] print(f'Validation on TEST SET:\nloss {res[0]}, {met_res}') results.append(['val', '-', res[1], 
res[0], '-', '-']) with open(log_dir / (name + '.csv'), 'a') as resultFile: wr = csv.writer(resultFile) wr.writerows(results)
data_loader = DataLoader(args.data_dir, args.bert_model_dir, params, token_pad_idx=0)

# Load training data and test data
train_data = data_loader.load_data('train')
val_data = data_loader.load_data('val')

# Specify the training and validation dataset sizes
params.train_size = train_data['size']
params.val_size = val_data['size']

# Prepare model
model = BertForTokenClassification.from_pretrained(args.bert_model_dir,
                                                   num_labels=len(params.tag2idx))
model.to(params.device)
if args.fp16:
    model.half()

if params.n_gpu > 1 and args.multi_gpu:
    model = torch.nn.DataParallel(model)

# Prepare optimizer
if params.full_finetuning:
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    # no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [{
        'params': [
def __init__(self, opt):
    super(BertNer, self).__init__()
    self.opt = opt
    self.num_labels = self.opt.tag_nums
    self.bertForToken = BertForTokenClassification.from_pretrained(
        self.opt.bert_model_dir, num_labels=self.opt.tag_nums)
                    help='path to save the final model')
args = parser.parse_args()

if args.lang == "en":
    bert_model = "bert-base-uncased"
    model_path = "/workdir/pretrain-model/bert-torch"
elif args.lang == "cn":
    bert_model = "bert-base-chinese"
    model_path = "/workdir/pretrain-model/bert-torch-cn"
bert_model = "bert-base-uncased"
# model_path = "D:/Github/BERT-Keyword-Extractor/model/en_model.pt"

tag2idx = {'B': 0, 'I': 1, 'O': 2}
tags_vals = ['B', 'I', 'O']

tokenizer = BertTokenizer.from_pretrained(bert_model, do_lower_case=True)
# cache_dir='/workdir/pretrain-model/bert-torch-cn')
model = BertForTokenClassification.from_pretrained(bert_model, num_labels=len(tag2idx))
# cache_dir='/workdir/pretrain-model/bert-torch-cn')


def casting(tkns, prediction):
    # expand a hit on a word piece so that the whole word is marked
    for k, j in enumerate(prediction):
        if j == 1 or j == 0:
            # a '##' piece in the middle of a word
            if not tkns[k].find('##') == -1:
                prediction[k] = 1
                forwd = False
                backward = False
                for i in range(int(len(tkns) / 2)):
                    # forward
                    if k - i >= 0:
def train(args: Dict): MAX_LEN = int(args['--max-len']) bs = int(args['--batch-size']) model_root = args['--model-root'] if args['--model-root'] else './models' dataLoader= sentence.Sentence(args['--train-src']) device = torch.device("cuda:0" if args['--cuda'] else "cpu") print('use device: %s' % device, file=sys.stderr) if args['--cuda']: n_gpu = torch.cuda.device_count() torch.cuda.get_device_name(0) tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case=False) tokenized_texts = [tokenizer.tokenize(sent) for sent in dataLoader.sentences] print(dataLoader.sentences[0]) print(tokenized_texts[0]) input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts], maxlen=MAX_LEN, dtype="long", truncating="post", padding="post") tags = pad_sequences([[dataLoader.tag2idx.get(l) for l in lab] for lab in dataLoader.labels], maxlen=MAX_LEN, value=dataLoader.tag2idx["O"], padding="post", dtype="long", truncating="post") attention_masks = [[float(i > 0) for i in ii] for ii in input_ids] """ The BERT Model requires us to have a [SEP] token at the end of each sentence as a part of its preprocessing. 102 is the index BERT recognizes as the index of [SEP]. Hence, I am adding it to the end of the sentence after padding/truncating (as it might have been removed if the sequences were greater than 75 in length) to be compatible with BERT's requirement. I didn't have it in the beginning and I thought it would be the reason for the poor results but changing it didn't help and I chose to keep it anyways as it felt right. :) """ for i, inp in enumerate(input_ids): if (102 not in inp): inp[-1] = 102 tags[i][-1] = dataLoader.tag2idx.get("O") tts = float(args['--train-test-split']) tr_inputs, val_inputs, tr_tags, val_tags = train_test_split(input_ids, tags, random_state=10, test_size=tts) tr_masks, val_masks, _, _ = train_test_split(attention_masks, input_ids, random_state=10, test_size=tts) tr_inputs = torch.tensor(tr_inputs).to(torch.int64) val_inputs = torch.tensor(val_inputs).to(torch.int64) tr_tags = torch.tensor(tr_tags).to(torch.int64) val_tags = torch.tensor(val_tags).to(torch.int64) tr_masks = torch.tensor(tr_masks) val_masks = torch.tensor(val_masks) train_data = TensorDataset(tr_inputs, tr_masks, tr_tags) train_sampler = RandomSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=bs) valid_data = TensorDataset(val_inputs, val_masks, val_tags) valid_sampler = SequentialSampler(valid_data) valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=bs) model = BertForTokenClassification.from_pretrained( "bert-base-multilingual-cased", num_labels=len(dataLoader.tag2idx)) if args['--cuda']: model.cuda() FULL_FINETUNING = True if args['--full-finetuning'] else False if FULL_FINETUNING: param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'gamma', 'beta'] optimizer_grouped_parameters = [ {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.01}, {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.0} ] else: param_optimizer = list(model.classifier.named_parameters()) optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}] optimizer = Adam(optimizer_grouped_parameters, lr=float(args['--lr'])) epochs = int(args['--max-epoch']) max_grad_norm = 1.0 hist_valid_scores = [] for _ in trange(epochs, desc="Epoch"): # TRAIN loop model.train() tr_loss = 0 nb_tr_examples, 
nb_tr_steps = 0, 0 for step, batch in enumerate(train_dataloader): # add batch to gpu batch = tuple(t.to(device) for t in batch) b_input_ids, b_input_mask, b_labels = batch # forward pass loss = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels) # backward pass loss.backward() # track train loss tr_loss += loss.item() nb_tr_examples += b_input_ids.size(0) nb_tr_steps += 1 # gradient clipping torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm) # update parameters optimizer.step() model.zero_grad() # print train loss per epoch print("Train loss: {}".format(tr_loss / nb_tr_steps)) # VALIDATION on validation set model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 predictions, true_labels = [], [] for batch in valid_dataloader: batch = tuple(t.to(device) for t in batch) b_input_ids, b_input_mask, b_labels = batch with torch.no_grad(): tmp_eval_loss = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels) logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask) logits = logits.detach().cpu().numpy() label_ids = b_labels.to('cpu').numpy() predictions.extend([list(p) for p in np.argmax(logits, axis=2)]) true_labels.append(label_ids) tmp_eval_accuracy = flat_accuracy(logits, label_ids) eval_loss += tmp_eval_loss.mean().item() eval_accuracy += tmp_eval_accuracy nb_eval_examples += b_input_ids.size(0) nb_eval_steps += 1 eval_loss = eval_loss / nb_eval_steps print("Validation loss: {}".format(eval_loss)) print("Validation Accuracy: {}".format(eval_accuracy / nb_eval_steps)) pred_tags = [dataLoader.tags_vals[p_i] for p in predictions for p_i in p] valid_tags = [dataLoader.tags_vals[l_ii] for l in true_labels for l_i in l for l_ii in l_i] f1=f1_score(valid_tags,pred_tags) print("F1-Score: {}".format(f1)) is_better = len(hist_valid_scores) == 0 or f1 > max(hist_valid_scores) hist_valid_scores.append(f1) if is_better: output_model_file = os.path.join(model_root, "model_file.bin") output_config_file = os.path.join(model_root, "config_file.bin") output_vocab_file = model_root model_to_save = model.module if hasattr(model, 'module') else model torch.save(model_to_save.state_dict(), output_model_file) model_to_save.config.to_json_file(output_config_file) tokenizer.save_vocabulary(output_vocab_file) print('reached maximum number of epochs!', file=sys.stderr) exit(0)
                                           device=device)
training_dataset = TensorDataset(all_input_ids, all_input_mask,
                                 all_segment_ids, all_label_ids)
training_sampler = RandomSampler(training_dataset)
training_dataloader = DataLoader(training_dataset,
                                 sampler=training_sampler,
                                 batch_size=training_batch_size)
num_train_steps = int(
    len(training_features) / training_batch_size /
    gradient_accumulation_steps * num_train_epochs)
t_total = num_train_steps

if if_bert:
    model = BertForTokenClassification.from_pretrained(
        '../../model/bert-base-chinese.tar.gz', num_labels=3).to(device)
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [p for n, p in param_optimizer
                   if not any(nd in n for nd in no_decay)],
        'weight_decay': 0.01
    }, {
        'params': [p for n, p in param_optimizer
                   if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]
def run_ner( lang: str = 'eng', log_dir: str = 'logs', task: str = NER, batch_size: int = 1, lr: float = 5e-5, epochs: int = 1, dataset: str = 'data/conll-2003/', loss: str = 'cross', max_seq_len: int = 128, do_lower_case: bool = False, warmup_proportion: float = 0.1, grad_acc_steps: int = 1, rand_seed: int = None, fp16: bool = False, loss_scale: float = None, ds_size: int = None, data_bunch_path: str = 'data/conll-2003/db', bertAdam: bool = False, freez: bool = False, one_cycle: bool = False, discr: bool = False, lrm: int = 2.6, div: int = None, tuned_learner: str = None, do_train: str = False, do_eval: str = False, save: bool = False, name: str = 'ner', mask: tuple = ('s', 's'), ): name = "_".join( map(str, [ name, task, lang, mask[0], mask[1], loss, batch_size, lr, max_seq_len, do_train, do_eval ])) log_dir = Path(log_dir) log_dir.mkdir(parents=True, exist_ok=True) init_logger(log_dir, name) if rand_seed: random.seed(rand_seed) np.random.seed(rand_seed) torch.manual_seed(rand_seed) if torch.cuda.is_available(): torch.cuda.manual_seed_all(rand_seed) trainset = dataset + lang + '/train.txt' devset = dataset + lang + '/dev.txt' testset = dataset + lang + '/test.txt' bert_model = 'bert-base-cased' if lang == 'eng' else 'bert-base-multilingual-cased' print(f'Lang: {lang}\nModel: {bert_model}\nRun: {name}') model = BertForTokenClassification.from_pretrained(bert_model, num_labels=len(VOCAB), cache_dir='bertm') model = torch.nn.DataParallel(model) model_lr_group = bert_layer_list(model) layers = len(model_lr_group) kwargs = {'max_seq_len': max_seq_len, 'ds_size': ds_size, 'mask': mask} train_dl = DataLoader(dataset=NerDataset(trainset, bert_model, train=True, **kwargs), batch_size=batch_size, shuffle=True, collate_fn=partial(pad, train=True)) dev_dl = DataLoader(dataset=NerDataset(devset, bert_model, **kwargs), batch_size=batch_size, shuffle=False, collate_fn=pad) test_dl = DataLoader(dataset=NerDataset(testset, bert_model, **kwargs), batch_size=batch_size, shuffle=False, collate_fn=pad) data = DataBunch(train_dl=train_dl, valid_dl=dev_dl, test_dl=test_dl, collate_fn=pad, path=Path(data_bunch_path)) loss_fun = ner_loss_func if loss == 'cross' else partial(ner_loss_func, zero=True) metrics = [Conll_F1()] learn = Learner( data, model, BertAdam, loss_func=loss_fun, metrics=metrics, true_wd=False, layer_groups=None if not freez else model_lr_group, path='learn', ) # initialise bert adam optimiser train_opt_steps = int(len(train_dl.dataset) / batch_size) * epochs optim = BertAdam(model.parameters(), lr=lr, warmup=warmup_proportion, t_total=train_opt_steps) if bertAdam: learn.opt = OptimWrapper(optim) else: print("No Bert Adam") # load fine-tuned learner if tuned_learner: print('Loading pretrained learner: ', tuned_learner) learn.load(tuned_learner) # Uncomment to graph learning rate plot # learn.lr_find() # learn.recorder.plot(skip_end=15) # set lr (discriminative learning rates) if div: layers = div lrs = lr if not discr else learn.lr_range(slice(lr / lrm**(layers), lr)) results = [['epoch', 'lr', 'f1', 'val_loss', 'train_loss', 'train_losses']] if do_train: for epoch in range(epochs): if freez: lay = (layers // (epochs - 1)) * epoch * -1 if lay == 0: print('Freeze') learn.freeze() elif lay == layers: print('unfreeze') learn.unfreeze() else: print('freeze2') learn.freeze_to(lay) print('Freezing layers ', lay, ' off ', layers) # Fit Learner - eg train model if one_cycle: learn.fit_one_cycle(1, lrs, moms=(0.8, 0.7)) else: learn.fit(1, lrs) results.append([ epoch, lrs, learn.recorder.metrics[0][0], 
learn.recorder.val_losses[0], np.array(learn.recorder.losses).mean(), learn.recorder.losses, ]) if save: m_path = learn.save(f"{lang}_{epoch}_model", return_path=True) print(f'Saved model to {m_path}') if save: learn.export(f'{lang}.pkl') if do_eval: res = learn.validate(test_dl, metrics=metrics) met_res = [f'{m.__name__}: {r}' for m, r in zip(metrics, res[1:])] print(f'Validation on TEST SET:\nloss {res[0]}, {met_res}') results.append(['val', '-', res[1], res[0], '-', '-']) with open(log_dir / (name + '.csv'), 'a') as resultFile: wr = csv.writer(resultFile) wr.writerows(results)
def build_model(full_fine_tunning=True, batch_size=32, epochs=3):
    df = load_data(DATA_PATH)
    df["word"] = df["word"].str.lower()
    data = getter(df)

    processed_texts = []
    processed_tags = []
    for item in data:
        string, tags = process_terms(item)
        processed_texts.append(string)
        processed_tags.append(tags)

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)
    tokenized_sents = [
        tokenizer.tokenize("[CLS] " + sent + " [SEP]") for sent in processed_texts
    ]
    tokenized_tags = label_tokenize(tokenized_sents, processed_tags)
    bert_sents, bert_labels = remove_long_sent(tokenized_sents, tokenized_tags)

    # indexing
    input_ids = [tokenizer.convert_tokens_to_ids(x) for x in bert_sents]
    label_ids = [[LABELS.get(l) for l in lab] for lab in bert_labels]
    input_ids_pad = pad_sequences(
        input_ids,
        maxlen=BERT_INPUT_SEQUENCE_LENGTH,
        dtype="long",
        truncating="post",
        padding="post",
    )
    labels_ids_pad = pad_sequences(
        label_ids,
        maxlen=BERT_INPUT_SEQUENCE_LENGTH,
        value=LABELS["O"],
        dtype="long",
        truncating="post",
        padding="post",
    )
    attention_masks = []
    for seq in input_ids_pad:
        mask = [float(i > 0) for i in seq]
        attention_masks.append(mask)

    train_data = TensorDataset(
        torch.tensor(input_ids_pad),
        torch.tensor(attention_masks),
        torch.tensor(labels_ids_pad),
    )
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler,
                                  batch_size=batch_size)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    print("GPU: {}".format(device))
    print("Number of GPUs: {}".format(n_gpu))
    if device == torch.device("cuda"):
        board = torch.cuda.get_device_name()
        print("Board: {}".format(board))

    model = BertForTokenClassification.from_pretrained("bert-base-uncased",
                                                       num_labels=len(LABELS))
    if device == torch.device("cuda"):
        model.cuda()

    if full_fine_tunning:
        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "gamma", "beta"]
        optimizer_grouped_parameters = [
            {
                "params": [
                    p for n, p in param_optimizer
                    if not any(nd in n for nd in no_decay)
                ],
                "weight_decay_rate": 0.01,
            },
            {
                "params": [
                    p for n, p in param_optimizer
                    if any(nd in n for nd in no_decay)
                ],
                "weight_decay_rate": 0.0,
            },
        ]
    else:
        param_optimizer = list(model.classifier.named_parameters())
        optimizer_grouped_parameters = [{
            "params": [p for n, p in param_optimizer]
        }]
    optimizer = BertAdam(optimizer_grouped_parameters, lr=5e-5, warmup=0.1)

    tr_loss_set = []
    for epoch in range(epochs):
        # train
        model.train()
        tr_loss = 0
        nb_tr_steps = 0
        for step, batch in enumerate(train_dataloader):
            batch = tuple(t.to(device) for t in batch)
            b_input_ids, b_input_masks, b_labels = batch
            logits = model(b_input_ids, token_type_ids=None,
                           attention_mask=b_input_masks)
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, len(LABELS)), b_labels.view(-1))
            tr_loss_set.append(loss.item())
            loss.backward()
            # gradient clipping
            torch.nn.utils.clip_grad_norm_(parameters=model.parameters(),
                                           max_norm=1.0)
            optimizer.step()
            model.zero_grad()
            tr_loss += loss.item()
            nb_tr_steps += 1
        print(f"# of EPOCH: {epoch}")
        print("Train loss: {}".format(tr_loss / nb_tr_steps))

    torch.save(model.state_dict(), str(MODEL_PATH))
    model.config.to_json_file(MODEL_CONFIG_PATH)
def load_token_classifier(self, classifier_model_name, tag2idx):
    self.model = BertForTokenClassification.from_pretrained(
        classifier_model_name, num_labels=len(tag2idx))
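# NOTE (illustrative assumption): load_token_classifier() is a method of an unnamed
# class; 'tagger' below stands in for an instance of that class.
tag2idx = {'O': 0, 'B-PER': 1, 'I-PER': 2}
tagger.load_token_classifier('bert-base-cased', tag2idx)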
### CONFIG ###
MAX_LEN = 50
BATCH_SIZE = 32
TRAIN_URL = 'https://raw.githubusercontent.com/cicl-iscl/CyberWallE/master/data/train-improved-sentiwordnet-arguingfullindiv-pos.tsv?token=AD7GEDK3MI27HVJPQWOE74C6FBZHA'
DEV_URL = 'https://raw.githubusercontent.com/cicl-iscl/CyberWallE/master/data/dev-improved-sentiwordnet-arguingfullindiv-pos.tsv?token=AD7GEDM3LOMZM6MP4HZS4MS6FBZHK'
EPOCHS = 3
MAX_GRAD_NORM = 1.0
##############

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
print(torch.cuda.get_device_name(0))

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
model = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=2)
model.cuda()


def get_comments(filename, url=True):
    if url:
        comments = []
        with urllib.request.urlopen(filename) as f:
            for line in f:
                if line.startswith(b'#'):
                    comments.append(line.decode("utf-8"))
                else:
                    break
        return comments
    with open(filename, 'r', encoding='utf8') as f:
        commentiter = takewhile(lambda s: s.startswith('#'), f)
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--data_dir", default='/home/adzuser/user_achyuta/BERT_NER_Test/BERT-NER/NERdata/', type=str, required=True, help= "The input data dir. Should contain the .tsv files (or other data files) for the task." ) parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " "bert-base-multilingual-cased, bert-base-chinese.") parser.add_argument("--task_name", default='NER', type=str, required=True, help="The name of the task to train.") parser.add_argument( "--output_dir", default='ner_output', type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written." ) ## Other parameters parser.add_argument( "--cache_dir", default="", type=str, help= "Where do you want to store the pre-trained models downloaded from s3") parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument("--do_test", action='store_true', help="Whether to run test on the test set.") parser.add_argument("--do_pred", action='store_true', help="Whether to run pred on the pred set.") parser.add_argument( "--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument( "--num_train_epochs", default=4.0, #3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument('--clip', type=float, default=0.5, help="gradient clipping") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.") parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") parser.add_argument('--text_a', type=str, default='', help="input text_a.") parser.add_argument('--text_b', type=str, default='', help="input text_b.") args = parser.parse_args() if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() processors = {"ner": NerProcessor} num_labels_task = { "ner": 17 #6#12 } if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval and not args.do_pred: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") if os.path.exists(args.output_dir) and os.listdir( args.output_dir) and args.do_train: raise ValueError( "Output directory ({}) already exists and is not empty.".format( args.output_dir)) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) task_name = args.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() num_labels = num_labels_task[task_name] label_list = processor.get_labels() tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) train_examples = None num_train_optimization_steps = None if args.do_train: train_examples = processor.get_train_examples(args.data_dir) #train_examples = train_examples[:1000] print("train_examples :: ", len(list(train_examples))) num_train_optimization_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size( ) # Prepare model cache_dir = args.cache_dir if args.cache_dir else os.path.join( PYTORCH_PRETRAINED_BERT_CACHE, 'distributed_{}'.format( args.local_rank)) #imodel = BertForSequenceClassification.from_pretrained(args.bert_model, # cache_dir=cache_dir, # num_labels = num_labels) model = BertForTokenClassification.from_pretrained(args.bert_model, cache_dir=cache_dir, num_labels=num_labels) if args.fp16: model.half() #model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 
training." ) model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) global_step = 0 nb_tr_steps = 0 tr_loss = 0 if args.do_train: train_features = convert_examples_to_features(train_examples, label_list, args.max_seq_length, tokenizer) #all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) #all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) #all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) #all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long) all_input_ids = [f.input_ids for f in train_features] all_input_mask = [f.input_mask for f in train_features] all_segment_ids = [f.segment_ids for f in train_features] all_label_ids = [f.label_id for f in train_features] # convert to cuda #all_input_ids = all_input_ids.to(device) #all_input_mask = all_input_mask.to(device) #all_segment_ids = all_segment_ids.to(device) #all_label_ids = all_label_ids.to(device) #train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) #if args.local_rank == -1: # train_sampler = RandomSampler(train_data) #else: # train_sampler = DistributedSampler(train_data) #train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) # create model #model.train() model = KerasClassifier(build_fn=create_model(model), verbose=0) # define the grid search parameters batch_size = [10, 20, 40, 60, 80, 100] epochs = [10, 50, 100] param_grid = dict(batch_size=batch_size, epochs=epochs) grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1) #keras_input = KerasInputFormatter([ # ('input_one', all_input_ids), # ('input_two', all_segment_ids), # ('input_three', all_input_mask),]) #input_one = Input(name = 'input_one', shape = (128,)) #input_two = Input(name = 'input_two', shape = (128,)) #input_three = Input(name = 'input_three', shape = (128,)) #input_four = Input(name = 'input_four', shape = (128,)) #output = y # define model here #self.model = Model(inputs = [input_one, input_two, input_three], outputs = output) grid_result = grid.fit( [[all_input_ids, all_segment_ids, all_input_mask]], [all_label_ids]) # summarize results print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_)) means = grid_result.cv_results_['mean_test_score'] stds = grid_result.cv_results_['std_test_score'] params = grid_result.cv_results_['params'] for mean, stdev, param in 
    zip(means, stds, params):
        print("%f (%f) with: %r" % (mean, stdev, param))
    '''
    model.train()
    for _ in trange(int(args.num_train_epochs), desc="Epoch"):
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids, label_ids = batch
            loss = model(input_ids, segment_ids, input_mask, label_ids)
            if n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            if args.fp16:
                optimizer.backward(loss)
            else:
                loss.backward()
            # added gradient clipping
            if args.clip is not None:
                torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
            tr_loss += loss.item()
            nb_tr_examples += input_ids.size(0)
            nb_tr_steps += 1
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    # modify the learning rate with the special warm-up BERT uses;
                    # if args.fp16 is False, BertAdam is used, which handles this automatically
                    lr_this_step = args.learning_rate * warmup_linear(
                        global_step / num_train_optimization_steps, args.warmup_proportion)
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = lr_this_step
                optimizer.step()
                optimizer.zero_grad()
                global_step += 1
    '''

    if args.do_train:
        # Save a trained model and the associated configuration
        model_to_save = model.module if hasattr(model, 'module') else model  # only save the model itself
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        torch.save(model_to_save.state_dict(), output_model_file)
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
        with open(output_config_file, 'w') as f:
            f.write(model_to_save.config.to_json_string())
        # Load the model and config that were just fine-tuned
        config = BertConfig(output_config_file)
        model = BertForTokenClassification(config, num_labels=num_labels)
        model.load_state_dict(torch.load(output_model_file))
    else:
        # Evaluation only: load a previously fine-tuned model and config
        print('for eval only......................')
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
        config = BertConfig(output_config_file)
        model = BertForTokenClassification(config, num_labels=num_labels)
        model.load_state_dict(torch.load(output_model_file))
    model.to(device)

    if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        eval_examples = processor.get_dev_examples(args.data_dir)
        print("dev_examples :: ", len(list(eval_examples)))
        eval_features = convert_examples_to_features_pred(eval_examples, label_list, args.max_seq_length, tokenizer)
        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
        # Run prediction for the full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0
        predictions, true_labels = [], []
        for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"):
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            label_ids = label_ids.to(device)
            with torch.no_grad():
                tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids)
                logits = model(input_ids, segment_ids, input_mask)
            logits = logits.detach().cpu().numpy()
            label_ids = label_ids.to('cpu').numpy()
            # truncate predictions and gold labels at the '[SEP]' position
            pred_xx = [list(p) for p in np.argmax(logits, axis=2)]
            pred_xx = [i[:i.index(label_list.index('[SEP]'))] for i in pred_xx]
            label_ids_xx = [i[:i.index(label_list.index('[SEP]'))] for i in label_ids.tolist()]
            # pad the shorter of the two sequences with the dummy id 31 so they can be compared position by position
            tmp_s = [max(len(i), len(j)) for i, j in zip(label_ids_xx, pred_xx)]
            tmp_u = [(i + [31] * (k - len(i)) if len(i) != k else i,
                      j + [31] * (k - len(j)) if len(j) != k else j)
                     for i, j, k in zip(label_ids_xx, pred_xx, tmp_s)]
            tmp_d1 = [h[0] for h in tmp_u]
            tmp_d2 = [h[1] for h in tmp_u]
            tmp_eval_accuracy = flat_accc(pred_xx, label_ids_xx)
            predictions.extend(tmp_d2)
            true_labels.append(tmp_d1)
            eval_loss += tmp_eval_loss.mean().item()
            eval_accuracy += tmp_eval_accuracy
            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1
        eval_loss = eval_loss / nb_eval_steps
        eval_accuracy = eval_accuracy / nb_eval_steps
        loss = tr_loss / nb_tr_steps if args.do_train else None
        pred_tags = [[label_list[p_i] if p_i != 31 else 'XXX' for p_i in p] for p in predictions]
        valid_tags = [[label_list[l_ii] if l_ii != 31 else 'YYY' for l_ii in l_i]
                      for l in true_labels for l_i in l]
        print("valid_tags : ", valid_tags[:10])
        print("pred_tags : ", pred_tags[:10])
        print("Validation F1-Score: {}".format(f1_score(valid_tags, pred_tags)))
        print("Validation accuracy_score : {}".format(accuracy_score(valid_tags, pred_tags)))
        print("Validation classification_report : {}".format(classification_report(valid_tags, pred_tags)))
        result = {'eval_loss': eval_loss, 'eval_accuracy': eval_accuracy,
                  'global_step': global_step, 'loss': loss}
        print(result)
        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            for key in sorted(result.keys()):
                writer.write("%s = %s\n" % (key, str(result[key])))

    if args.do_test and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        eval_examples = processor.get_test_examples(args.data_dir)
        print('test examples len : {}'.format(len(eval_examples)))
        eval_features = convert_examples_to_features_pred(eval_examples, label_list, args.max_seq_length, tokenizer)
        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
        # Run prediction for the full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
        model.eval()
        test_loss, test_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0
        predictions, true_labels = [], []
        for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"):
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            label_ids = label_ids.to(device)
            with torch.no_grad():
                tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids)
                logits = model(input_ids, segment_ids, input_mask)
            logits = logits.detach().cpu().numpy()
            label_ids = label_ids.to('cpu').numpy()
            # truncate predictions and gold labels at the '[SEP]' position
            pred_xx = [list(p) for p in np.argmax(logits, axis=2)]
            pred_xx = [i[:i.index(label_list.index('[SEP]'))] for i in pred_xx]
            label_ids_xx = [i[:i.index(label_list.index('[SEP]'))] for i in label_ids.tolist()]
            # pad the shorter of the two sequences with the dummy id 31 so they can be compared position by position
            tmp_s = [max(len(i), len(j)) for i, j in zip(label_ids_xx, pred_xx)]
            tmp_u = [(i + [31] * (k - len(i)) if len(i) != k else i,
                      j + [31] * (k - len(j)) if len(j) != k else j)
                     for i, j, k in zip(label_ids_xx, pred_xx, tmp_s)]
            tmp_d1 = [h[0] for h in tmp_u]
            tmp_d2 = [h[1] for h in tmp_u]
            tmp_eval_accuracy = flat_accc(pred_xx, label_ids_xx)
            predictions.extend(tmp_d2)
            true_labels.append(tmp_d1)
            test_loss += tmp_eval_loss.mean().item()
            test_accuracy += tmp_eval_accuracy
            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1
        test_loss = test_loss / nb_eval_steps
        test_accuracy = test_accuracy / nb_eval_steps
        loss = tr_loss / nb_tr_steps if args.do_train else None
        pred_tags = [[label_list[p_i] if p_i != 31 else 'XXX' for p_i in p] for p in predictions]
        valid_tags = [[label_list[l_ii] if l_ii != 31 else 'YYY' for l_ii in l_i]
                      for l in true_labels for l_i in l]
        print("valid_tags : ", valid_tags[:10])
        print("pred_tags : ", pred_tags[:10])
        print("Test F1-Score: {}".format(f1_score(valid_tags, pred_tags)))
        print("Test accuracy_score : {}".format(accuracy_score(valid_tags, pred_tags)))
        print("Test classification_report : {}".format(classification_report(valid_tags, pred_tags)))
        result = {'test_loss': test_loss, 'test_accuracy': test_accuracy,
                  'global_step': global_step, 'loss': loss}
        print(result)
        output_test_file = os.path.join(args.output_dir, "test_results.txt")
        with open(output_test_file, "w") as writer:
            for key in sorted(result.keys()):
                writer.write("%s = %s\n" % (key, str(result[key])))

    if args.do_pred and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        model.eval()
        while True:
            print('enter a text to get NER. otherwise press Ctrl+C to close session.')
            text_a = input('>>>')
            # e.g. "Japan began the defence of their Asian Cup title with a lucky 2-1 win against Syria in a Group C championship match on Friday . ."
            eval_examples = {'text_a': text_a,
                             'text_b': "The foodservice pie business does not fit our long-term growth strategy .",
                             'label': '1',
                             'guid': '12345'}
            eval_features = convert_examples_to_features_test(eval_examples, label_list, args.max_seq_length, tokenizer)
            all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
            all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
            all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
            all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
            eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
            # Run prediction for the entered sentence
            eval_sampler = SequentialSampler(eval_data)
            eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
            eval_loss, eval_accuracy = 0, 0
            nb_eval_steps, nb_eval_examples = 0, 0
            predictions, true_labels = [], []
            for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"):
                input_ids = input_ids.to(device)
                input_mask = input_mask.to(device)
                segment_ids = segment_ids.to(device)
                label_ids = label_ids.to(device)
                with torch.no_grad():
                    tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids)
                    logits = model(input_ids, segment_ids, input_mask)
                logits = logits.detach().cpu().numpy()
                label_ids = label_ids.to('cpu').numpy()
                pred_xx = [list(p) for p in np.argmax(logits, axis=2)]
                pred_xx = [i[:i.index(label_list.index('[SEP]'))] for i in pred_xx]
                print(pred_xx)
                print([[label_list[p_i] if p_i != 31 else 'XXX' for p_i in p] for p in pred_xx])
def do_07_bert(data: pd.DataFrame, cv=5):
    # This has multiple issues that we couldn't fix at this moment
    # TODO: Try with cased model
    # TODO: Try with data already tokenized by BERT tokenizer
    # TODO: Fix out-of-memory error if possible
    model_name = 'bert-base-uncased'
    getter = SentenceGetter(data)
    # we need actual sentences this time, as BERT provides a tokenizer we will re-use
    sentences = [" ".join([s[0] for s in sentence]) for sentence in getter.sentences]
    labels = [[s[2] for s in sent] for sent in getter.sentences]
    tags_vals = list(set(data["Tag"].values))
    tag2idx = {t: i for i, t in enumerate(tags_vals)}
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    bs = batch_size = 32
    # use BERT's tokenizer
    tokenizer = BertTokenizer.from_pretrained(model_name, do_lower_case=True)
    tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]
    MAX_LEN = max(len(s) for s in tokenized_texts)
    print("MAX_LEN: %d" % MAX_LEN)
    # NOTE: The tutorial seems to assume that BERT and our input are basically tokenized
    # to the same units, making the labels still applicable to the input.
    # Result of the below for the original texts is:
    # Mean: 2.68, differing: 0.73
    # for i in [4, 7, 58, 1200]:
    #     print(sentences[i])
    #     print(tokenized_texts[i])
    #     print("---")
    # differences = [len(tokenized_texts[i]) - len(labels[i]) for i in range(len(sentences))]
    # differences = [d * -1 if d < 0 else d for d in differences]
    # mean = sum(differences) / len(differences)
    # print("Mean: %.2f, differing: %.2f" % (mean, len([d for d in differences if d != 0]) / len(sentences)))
    # Pad the inputs
    input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
                              maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
    tags = pad_sequences([[tag2idx.get(l) for l in lab] for lab in labels],
                         value=tag2idx["O"], maxlen=MAX_LEN, dtype="long", padding="post", truncating="post")
    # Prepare test and training data
    attention_masks = [[float(i > 0) for i in ii] for ii in input_ids]
    tr_inputs, val_inputs, tr_tags, val_tags = train_test_split(input_ids, tags, random_state=2018, test_size=0.1)
    tr_masks, val_masks, _, _ = train_test_split(attention_masks, input_ids, random_state=2018, test_size=0.1)
    tr_inputs = torch.tensor(tr_inputs)
    val_inputs = torch.tensor(val_inputs)
    tr_tags = torch.tensor(tr_tags)
    val_tags = torch.tensor(val_tags)
    tr_masks = torch.tensor(tr_masks)
    val_masks = torch.tensor(val_masks)
    # training data will be shuffled
    train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=bs)
    # validation data will be given sequentially
    valid_data = TensorDataset(val_inputs, val_masks, val_tags)
    valid_sampler = SequentialSampler(valid_data)
    valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=bs)
    # load the model and send params to GPU if available
    model = BertForTokenClassification.from_pretrained("bert-base-uncased", num_labels=len(tag2idx))
    if device.type == "cuda":
        model.cuda()
    # Parameters for fine-tuning
    FULL_FINETUNING = True
    if FULL_FINETUNING:
        # "We also add some weight_decay as regularization to the main weight matrices."
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'gamma', 'beta']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
             'weight_decay_rate': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
             'weight_decay_rate': 0.0}
        ]
    else:
        # "If you have limited resources, you can also try to just train the linear classifier on
        # top of Bert and keep all other weights fixed. This will still give you a good performance."
        param_optimizer = list(model.classifier.named_parameters())
        optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]
    optimizer = Adam(optimizer_grouped_parameters, lr=3e-5)

    # A helper for fine-tuning: flat token-level accuracy over the padded batch
    def flat_accuracy(preds, labels):
        pred_flat = np.argmax(preds, axis=2).flatten()
        labels_flat = labels.flatten()
        return np.sum(pred_flat == labels_flat) / len(labels_flat)

    # RUN FINE-TUNING
    epochs = 5
    max_grad_norm = 1.0
    for _ in trange(epochs, desc="Epoch"):
        # TRAIN loop
        model.train()
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        for step, batch in enumerate(train_dataloader):
            # add batch to gpu
            batch = tuple(t.to(device) for t in batch)
            b_input_ids, b_input_mask, b_labels = batch
            # forward pass
            loss = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
            # backward pass
            loss.backward()
            # track train loss
            tr_loss += loss.item()
            nb_tr_examples += b_input_ids.size(0)
            nb_tr_steps += 1
            # gradient clipping
            torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
            # update parameters
            optimizer.step()
            model.zero_grad()
        # print train loss per epoch
        print("Train loss: {}".format(tr_loss / nb_tr_steps))
        # VALIDATION on validation set
        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0
        predictions, true_labels = [], []
        for batch in valid_dataloader:
            batch = tuple(t.to(device) for t in batch)
            b_input_ids, b_input_mask, b_labels = batch
            with torch.no_grad():
                tmp_eval_loss = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
                logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
            logits = logits.detach().cpu().numpy()
            label_ids = b_labels.to('cpu').numpy()
            predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
            true_labels.append(label_ids)
            tmp_eval_accuracy = flat_accuracy(logits, label_ids)
            eval_loss += tmp_eval_loss.mean().item()
            eval_accuracy += tmp_eval_accuracy
            nb_eval_examples += b_input_ids.size(0)
            nb_eval_steps += 1
        eval_loss = eval_loss / nb_eval_steps
        print("Validation loss: {}".format(eval_loss))
        print("Validation Accuracy: {}".format(eval_accuracy / nb_eval_steps))
        pred_tags = [tags_vals[p_i] for p in predictions for p_i in p]
        valid_tags = [tags_vals[l_ii] for l in true_labels for l_i in l for l_ii in l_i]
        print("F1-Score: {}".format(f1_score(pred_tags, valid_tags)))

    # EVALUATION
    model.eval()
    predictions = []
    true_labels = []
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    for batch in valid_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        with torch.no_grad():
            tmp_eval_loss = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
            logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
        logits = logits.detach().cpu().numpy()
        predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
        label_ids = b_labels.to('cpu').numpy()
        true_labels.append(label_ids)
        tmp_eval_accuracy = flat_accuracy(logits, label_ids)
        eval_loss += tmp_eval_loss.mean().item()
        eval_accuracy += tmp_eval_accuracy
        nb_eval_examples += b_input_ids.size(0)
        nb_eval_steps += 1
    pred_tags = [[tags_vals[p_i] for p_i in p] for p in predictions]
    valid_tags = [[tags_vals[l_ii] for l_ii in l_i] for l in true_labels for l_i in l]
    print("Validation loss: {}".format(eval_loss / nb_eval_steps))
    print("Validation Accuracy: {}".format(eval_accuracy / nb_eval_steps))
    print("Validation F1-Score: {}".format(f1_score(pred_tags, valid_tags)))
    exit()
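# The NOTE/TODO in do_07_bert points at the underlying problem: BERT's WordPiece
# tokenizer splits words into sub-tokens, so the word-level labels no longer line up
# with the model input. A minimal sketch of one common fix -- repeating each word's
# label for every sub-token it produces -- is shown below. The helper name is an
# assumption for illustration; it is not part of the original code.
def align_labels_to_wordpieces(words, word_labels, tokenizer):
    """Tokenize word by word and repeat each label over its sub-tokens."""
    tokens, labels = [], []
    for word, label in zip(words, word_labels):
        sub_tokens = tokenizer.tokenize(word)
        if not sub_tokens:  # e.g. some unicode characters tokenize to nothing
            sub_tokens = ['[UNK]']
        tokens.extend(sub_tokens)
        labels.extend([label] * len(sub_tokens))
    return tokens, labels

# Example (hypothetical input):
# tokens, labels = align_labels_to_wordpieces(
#     ["Washington", "rejects", "Iraqi", "proposal"],
#     ["B-geo", "O", "B-gpe", "O"],
#     BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True))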
tr_inputs = torch.tensor(tr_inputs)
val_inputs = torch.tensor(val_inputs)
tr_tags = torch.tensor(tr_tags)
val_tags = torch.tensor(val_tags)
tr_masks = torch.tensor(tr_masks)
val_masks = torch.tensor(val_masks)
train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=bs)
valid_data = TensorDataset(val_inputs, val_masks, val_tags)
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=bs)
model = BertForTokenClassification.from_pretrained("bert-base-uncased", num_labels=len(tag2idx))
model.cuda()
FULL_FINETUNING = True
if FULL_FINETUNING:
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay_rate':
def train(config):
    print('-' * 50)
    print('Loading pre-trained transfer model......')
    model = BertForTokenClassification.from_pretrained(config['model_name'], num_labels=config['tagset_size'] + 1)
    if config['update_model']:
        load_model_name = config['update_model']
        print('The update model to load is: ' + load_model_name)
        checkpoint = torch.load(load_model_name)
        model.load_state_dict(checkpoint['net'])
    model.to(config['device'])
    print('Loading pre-trained transfer model done!')
    print('-' * 50)
    print('Deploying the training data......')
    train_data_loader = prepare_data(config)
    print('The training data is %d batches with batch size %d' % (len(train_data_loader), config['batch_size']))
    if os.path.isfile(config['valid_file']):
        print('Dev process set! Deploying the dev data......')
        config['mode'] = config['mode'].replace('train', 'valid')
        valid_sents, valid_data_loader = prepare_data(config)
        print('The validation data has been loaded!')
        config['mode'] = config['mode'].replace('valid', 'train')
    else:
        print('The valid file ' + config['valid_file'] + ' does not exist, please check it in config.txt')
    print('-' * 50)
    print('Train step! The model runs on ' + str(config['device']))
    loss_list = dict()
    # training setup: no weight decay on bias and LayerNorm parameters
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    optimizer = BertAdam(optimizer_grouped_parameters, lr=config['lr'], schedule=None)
    loss_function = nn.NLLLoss(ignore_index=0)
    best_f1 = 0.0
    for epoch in range(config['epochs']):
        model.train()
        model.zero_grad()
        total_loss = 0
        batch_step = 0
        for batch, data in enumerate(train_data_loader):
            batch_step += 1
            inputs, labels, seq_len = data
            batch_size = inputs.size(0)
            seql = inputs.size(1) - 2
            segments_tensors = torch.zeros(inputs.size(0), inputs.size(1), dtype=torch.int64)
            logits = model(input_ids=inputs.to(config['device']),
                           token_type_ids=segments_tensors.to(config['device']),
                           attention_mask=None, labels=None)
            # drop the [CLS] and [SEP] positions before computing the loss
            logits = logits[:, 1:-1, :]
            logits = logits.reshape(batch_size * seql, -1)
            logits = F.log_softmax(logits, 1)
            labels = labels.to(config['device'])
            loss = loss_function(logits, labels.view(batch_size * seql))
            total_loss += float(loss)
            loss.backward()
            optimizer.step()
            model.zero_grad()
            print("\rEpoch: %d ! the process is in %d of %d ! " % (epoch + 1, batch + 1, len(train_data_loader)), end='')
        loss_avg = total_loss / batch_step
        loss_list[epoch] = loss_avg
        print("The loss is %f !" % loss_avg)
        # validation process
        if os.path.isfile(config['valid_file']) and os.path.isfile(config['gold_file']):
            model.eval()
            with torch.no_grad():
                valid_results = predict(model, valid_data_loader, config)
            valid_results = restore_result(valid_sents, valid_results)
            tmp_filename = ''.join(random.sample(string.ascii_letters + string.digits, 8))
            write(valid_results, tmp_filename + '.txt')
            # create an empty dictionary file for the scoring script
            ftmp = open(tmp_filename + '_dict.txt', 'w', encoding='utf8')
            ftmp.close()
            res = score_shell(tmp_filename + '_dict.txt', config['gold_file'],
                              tmp_filename + '.txt', tmp_filename + '_score.txt')
            if res == 0:
                get_score_cmd = 'grep \'F MEASURE\' ' + tmp_filename + '_score.txt'
                f1 = os.popen(get_score_cmd).read().replace('\n', '')
                print('The evaluation of epoch {} is {} !'.format(str(epoch + 1), f1))
            else:
                print('The score command failed, please check it or remove the validation step')
            os.system('rm ' + tmp_filename + '_dict.txt')
            os.system('rm ' + tmp_filename + '.txt')
            os.system('rm ' + tmp_filename + '_score.txt')
        # model save process
        if config['model_path'] and (epoch + 1) % int(config['save_model_epochs']) == 0:
            state = {'net': model.state_dict(), 'optimizer': optimizer.state_dict(), 'epoch': epoch}
            if config['save_model_name']:
                model_name = os.path.join(config['model_path'],
                                          config['save_model_name'] + '_' + str(epoch + 1) + '.pkl')
            else:
                model_name = os.path.join(config['model_path'], str(epoch + 1) + '.pkl')
            torch.save(state, model_name)
            print('The epoch %d is saved successfully, named %s !' % (epoch + 1, model_name))
    print('train done!')
test_y_tensor = torch.tensor(test_output_ids).to(device)
test_mask_tensor = torch.tensor(test_attention_masks).to(device)
train_data = TensorDataset(train_x_tensor, train_mask_tensor, train_y_tensor)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=bs)
test_data = TensorDataset(test_x_tensor, test_mask_tensor, test_y_tensor)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=bs)
model = BertForTokenClassification.from_pretrained("bert-base-cased", num_labels=len(label2id)).to(device)
FULL_FINETUNING = False
if FULL_FINETUNING:
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay_rate':
val_inputs = torch.tensor(val_inputs)
tr_tags = torch.tensor(tr_tags)
val_tags = torch.tensor(val_tags)
tr_masks = torch.tensor(tr_masks)
val_masks = torch.tensor(val_masks)
train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=bs)
valid_data = TensorDataset(val_inputs, val_masks, val_tags)
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=bs)
model = BertForTokenClassification.from_pretrained(bert_model, num_labels=len(tag2idx), cache_dir=utiler.cache_dir)
model = model.cuda()
# model = torch.nn.DataParallel(model)
FULL_FINETUNING = True
if FULL_FINETUNING:
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
        {