def __init__(self, model_path: str = HF_MODEL_PATH, no_product_labels: bool = False):
    if model_path == HF_MODEL_PATH:
        self.model = DistilBertForTokenClassification.from_pretrained(HF_MODEL_PATH)
    else:
        self.model = DistilBertForTokenClassification.from_pretrained(model_path)
    self.device = "cuda" if torch.cuda.is_available() else "cpu"
    self.tokenizer = get_tokenizer()
    self.label_dict = id2tag_no_prod if no_product_labels else id2tag
    self.model.to(self.device)
    self.model.eval()
def __init__(self, model_path, tag_path):
    with open(tag_path, "r") as tag_file:
        file_content = tag_file.read().strip()
    self.id_to_tag = file_content.splitlines()
    self.model = DistilBertForTokenClassification.from_pretrained(
        'distilbert-base-cased', num_labels=len(self.id_to_tag))
    self.model.load_state_dict(torch.load(model_path))
    self.model.eval()
    self.tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-cased')
def __init__(self, model_path=None, use_cuda=False):
    if not model_path:
        model_path = get_model_path()
    if not os.path.exists(model_path):
        raise FileNotFoundError("Cannot find model under " + model_path)
    self.device = "cuda" if use_cuda and torch.cuda.is_available() else "cpu"
    self.model = DistilBertForTokenClassification.from_pretrained(model_path)
    self.model.to(self.device)
    self.tokenizer = DistilBertTokenizerFast.from_pretrained(model_path)
    self.label_map = self.get_label_map(model_path)
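# A possible get_label_map() to accompany the __init__ above. This is only a sketch: the
# original helper is not shown, and it may read the labels from a separate file. The sketch
# assumes the checkpoint was saved with save_pretrained(), so the label names are available
# in the loaded model's config as id2label.
def get_label_map(self, model_path):
    # id2label keys may be ints or strings depending on how the config was serialized
    return {int(idx): label for idx, label in self.model.config.id2label.items()}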
def retrain(epochs_per_item=2, min_to_train=5):
    global current_model
    global currently_training
    global new_annotation_count
    global labeled_data

    if currently_training:
        print("skipping while model already training")
        return

    if len(labeled_data) < min_to_train:
        print("too few annotations to train: " + str(len(labeled_data)))
        return

    currently_training = True
    new_annotation_count = 0

    new_model = DistilBertForTokenClassification.from_pretrained(
        'distilbert-base-cased', num_labels=5)

    for epoch in range(0, epochs_per_item):
        print("epoch " + str(epoch))
        shuffle(labeled_data)
        for report in labeled_data:
            annotations = json.loads(report[8])
            report_text = report[1]
            train_item(new_model, annotations, report_text)
            eel.sleep(0.01)  # allow other processes through

    ''' MODEL EVALUATION CODE HERE IF YOU WANT TO TEST THAT IT IS GETTING BETTER '''

    current_model = new_model

    timestamp = re.sub(r'\.[0-9]*', '_', str(datetime.now())).replace(" ", "_").replace("-", "").replace(":", "")
    number_items = str(len(labeled_data))
    model_path = "models/" + timestamp + number_items + ".model"
    current_model.save_pretrained(model_path)
    if verbose:
        print("saved model to " + model_path)

    clean_old_models()
    currently_training = False
def train(dataset_path, tag_path, model_save_path):
    texts, tags = read_dataset(dataset_path)
    #train_texts, val_texts, train_tags, val_tags = train_test_split(texts, tags, test_size=.2, shuffle=True)

    with open(tag_path, "r") as tag_file:
        content = tag_file.read().strip()
    unique_tags = content.splitlines()
    tag_to_id = {tag: id for id, tag in enumerate(unique_tags)}

    tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-cased')
    train_encodings = tokenizer(texts,
                                is_split_into_words=True,
                                return_offsets_mapping=True,
                                padding=True,
                                truncation=True)
    #val_encodings = tokenizer(val_texts, is_split_into_words=True, return_offsets_mapping=True, padding=True, truncation=True)

    train_labels = encode_tags(tags, train_encodings, tag_to_id)
    #val_labels = encode_tags(val_tags, val_encodings, tag_to_id)

    train_encodings.pop("offset_mapping")  # we don't want to pass this to the model
    #val_encodings.pop("offset_mapping")
    train_dataset = NERDataset(train_encodings, train_labels)
    #val_dataset = NERDataset(val_encodings, val_labels)

    model = DistilBertForTokenClassification.from_pretrained(
        'distilbert-base-cased', num_labels=len(unique_tags))

    training_args = TrainingArguments(
        output_dir='./results',            # output directory
        overwrite_output_dir=True,
        num_train_epochs=100,              # total number of training epochs
        per_device_train_batch_size=16,    # batch size per device during training
        per_device_eval_batch_size=64,     # batch size for evaluation
        warmup_steps=500,                  # number of warmup steps for learning rate scheduler
        weight_decay=0.01,                 # strength of weight decay
        logging_dir='./logs',              # directory for storing logs
        logging_steps=10)

    trainer = Trainer(
        model=model,                       # the instantiated 🤗 Transformers model to be trained
        args=training_args,                # training arguments, defined above
        train_dataset=train_dataset,       # training dataset
        #eval_dataset=val_dataset          # evaluation dataset
    )

    model.train()
    trainer.train()
    #print("eval", trainer.evaluate())
    trainer.save_model(model_save_path)
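# Hypothetical invocation of the train() function above; the three paths are placeholders,
# not values taken from the original project:
if __name__ == "__main__":
    train("data/train_dataset.txt", "data/tags.txt", "models/ner-distilbert")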
def __init__(self, num_labels, model_name, output_hidden_states=False,
             output_attentions=False, batch_first=True, use_crf=True):
    super(TokenDistilBERT, self).__init__()
    self.num_labels = num_labels
    self.batch_first = batch_first
    self.use_crf = use_crf
    self.tokendistilbert = DistilBertForTokenClassification.from_pretrained(
        model_name,
        num_labels=self.num_labels,
        output_hidden_states=output_hidden_states,
        output_attentions=output_attentions)
    if self.use_crf:
        self.crf = CRF(self.num_labels, batch_first=self.batch_first)
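# A possible forward() to pair with the __init__ above. The original forward() is not shown,
# so this is only a sketch; it assumes `CRF` is torchcrf.CRF with batch_first=True, and that
# `labels` contains valid tag ids (no -100 padding values) wherever attention_mask is 1.
def forward(self, input_ids, attention_mask, labels=None):
    outputs = self.tokendistilbert(input_ids=input_ids, attention_mask=attention_mask)
    emissions = outputs.logits                     # (batch, seq_len, num_labels)
    if not self.use_crf:
        return emissions
    mask = attention_mask.bool()
    if labels is not None:
        # torchcrf's CRF returns the log-likelihood of the gold tag sequence;
        # negate it to obtain a loss to minimize.
        return -self.crf(emissions, labels, mask=mask, reduction='mean')
    # Viterbi-decode the most likely tag sequence for each sentence.
    return self.crf.decode(emissions, mask=mask)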
def makeMultilabelModel(self, modelName, num_labels=10, root='', **kwargs):
    if modelName in ['distilbert-base-uncased', 'distilbert2/', 'distilbert3/']:
        print(root)
        tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
        model = DistilBertForTokenClassification.from_pretrained(
            root + modelName, num_labels=num_labels, **kwargs)
    elif modelName == 'bertweet':
        tokenizer = AutoTokenizer.from_pretrained('vinai/bertweet-base')
        model = AutoModelForTokenClassification.from_pretrained(
            root + "vinai/bertweet-base", num_labels=num_labels, **kwargs)
    elif modelName == 'distilroberta-base':
        tokenizer = AutoTokenizer.from_pretrained('distilroberta-base', add_prefix_space=True)
        model = AutoModelForTokenClassification.from_pretrained(
            root + "distilroberta-base", num_labels=num_labels, **kwargs)
    elif modelName == 'lstm':
        tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
        model = LSTMTagger(128, 64, 2, tokenizer.vocab_size, num_labels)
    elif modelName == 'albert-base-v2':
        tokenizer = AutoTokenizer.from_pretrained('albert-base-v2', add_prefix_space=True)
        model = AutoModelForTokenClassification.from_pretrained(
            root + "albert-base-v2", num_labels=num_labels, **kwargs)
    elif modelName == 'squeezebert/squeezebert-uncased':
        tokenizer = AutoTokenizer.from_pretrained('squeezebert/squeezebert-uncased', add_prefix_space=True)
        model = AutoModelForTokenClassification.from_pretrained(
            root + "squeezebert/squeezebert-uncased", num_labels=num_labels, **kwargs)
    elif modelName == 'xlnet-base-cased':
        tokenizer = AutoTokenizer.from_pretrained('xlnet-base-cased', add_prefix_space=True)
        model = AutoModelForTokenClassification.from_pretrained(
            root + "xlnet-base-cased", num_labels=num_labels, **kwargs)
    return tokenizer, model
def load_existing_model():
    global current_model

    # Find the most recently listed saved model, if any
    model_path = ""
    files = os.listdir('models')
    for file_name in files:
        if file_name.endswith(".model"):
            model_path = 'models/' + file_name

    if model_path != '':
        if verbose:
            print("Loading model from " + model_path)
        current_model = DistilBertForTokenClassification.from_pretrained(model_path, num_labels=5)
        eel.sleep(0.1)
        # get_predictions()
    else:
        if verbose:
            print("Creating new uninitialized model (OK to ignore warnings)")
        current_model = DistilBertForTokenClassification.from_pretrained('distilbert-base-uncased', num_labels=5)
def test_model(model_path, tag_path):
    with open(tag_path, "r") as tag_file:
        file_content = tag_file.read().strip()
    id_to_tag = file_content.splitlines()

    model = DistilBertForTokenClassification.from_pretrained(
        'distilbert-base-cased', num_labels=len(id_to_tag))
    model.load_state_dict(torch.load(model_path))
    model.eval()

    tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-cased')

    run = True
    while run:
        print("Input sentence to test:")
        text = input("> ")
        encoded = tokenizer(text,
                            is_split_into_words=False,
                            return_offsets_mapping=True,
                            padding=True,
                            truncation=True,
                            return_tensors="pt")
        output = model(encoded.input_ids, encoded.attention_mask)
        logits = output.logits
        logits_softmax = torch.nn.Softmax(dim=2)(logits).detach().cpu()

        entities = []
        for token_index in range(logits_softmax.shape[1]):
            max_id = torch.argmax(logits_softmax[0, token_index, :]).numpy()
            max_id_value = logits_softmax[0, token_index, max_id].numpy()
            current_offsets = encoded.offset_mapping[0, token_index, :]
            # Special tokens ([CLS], [SEP], padding) get a (0, 0) offset mapping
            if current_offsets[0] == 0 and current_offsets[1] == 0:
                continue
            word = text[current_offsets[0]:current_offsets[1]]
            if id_to_tag[max_id] == "O":
                continue
            entities.append((word, id_to_tag[max_id], max_id_value))

        print("Found entities:")
        for (word, tag, conf) in entities:
            print(f"'{word}': {tag} (conf: {conf*100.0:.4f}%)")
val_encodings = tokenizer(validate_texts, is_split_into_words=True, return_offsets_mapping=True, padding=True, truncation=True)

train_labels = encode_tags(train_is_argum, train_encodings, tag2id)
val_labels = encode_tags(validate_is_argum, val_encodings, tag2id)

train_encodings.pop("offset_mapping")  # we don't want to pass this to the model
val_encodings.pop("offset_mapping")
train_dataset = WNUTDataset(train_encodings, train_labels)
val_dataset = WNUTDataset(val_encodings, val_labels)

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model = DistilBertForTokenClassification.from_pretrained('distilbert-base-cased', num_labels=len(unique_tags))
model.to(device)
model.train()

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

optim = AdamW(model.parameters(), lr=5e-5)

for epoch in range(3):
    for batch in train_loader:
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]
        loss.backward()
        optim.step()
elif args.pre_wgts == 'pubmed-full':
    pre_wgts = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext"
elif args.pre_wgts == 'pubmed-abs':
    pre_wgts = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract"
else:  # args.pre_wgts == 'bert-base'
    pre_wgts = "bert-base-uncased"

if args.model == 'bert':
    model = BertForTokenClassification.from_pretrained(pre_wgts, num_labels=n_tags)
elif args.model == 'bert_crf':
    model = BERT_CRF.from_pretrained(pre_wgts, num_labels=n_tags)
elif args.model == 'bert_lstm_crf':
    model = BERT_LSTM_CRF.from_pretrained(pre_wgts, num_labels=n_tags)
elif args.model == 'distil':
    model = DistilBertForTokenClassification.from_pretrained(pre_wgts, num_labels=n_tags)
elif args.model == 'distil_crf':
    model = Distil_CRF.from_pretrained(pre_wgts, num_labels=n_tags)

model.to(device)
optimizer = AdamW(model.parameters(), lr=args.lr)

# Slanted triangular learning rate scheduler (linear warmup followed by linear decay)
total_steps = len(train_loader) * args.epochs // args.accum_step
warm_steps = int(total_steps * args.warm_frac)
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=warm_steps,
                                            num_training_steps=total_steps)

#%% Train the model
if not os.path.exists(args.exp_dir):
def predict_spans(sentences):
    tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased", do_lower_case=True)

    # Load configuration
    config = DistilBertConfig.from_pretrained("distilbert-base-uncased", num_labels=3)

    # Load model
    bert_model = DistilBertForTokenClassification.from_pretrained(
        "distilbert-base-uncased", config=config)

    checkpoints = [output_dir]
    for checkp in checkpoints:
        model = BertLstmCrf(
            bert_model,
            config,
            num_labels=3,
            embedding_dim=config.hidden_size,
            hidden_dim=int(config.hidden_size / 2),
            rnn_layers=0,
            # rnn_dropout=config.hidden_dropout_prob,
            # output_dropout=config.hidden_dropout_prob,
            use_cuda=True,
        )
        checkpoint = os.path.join(checkp, WEIGHTS_NAME)
        state_dict = torch.load(checkpoint)
        model.load_state_dict(state_dict, strict=False)
        model.to("cuda:0")

    del bert_model
    gc.collect()

    examples = [
        InputExample(words=sentence.split(),
                     guid=[],
                     labels=["O" for x in sentence.split()])
        for sentence in sentences
    ]

    model_type = "distilbert"
    max_seq_length = 256
    pad_token_label_id = CrossEntropyLoss().ignore_index
    label_list = ["O", "B-PROP", "I-PROP"]

    features = convert_examples_to_features(
        examples,
        label_list,
        max_seq_length,
        tokenizer,
        cls_token_at_end=bool(model_type in ["xlnet"]),  # xlnet has a cls token at the end
        cls_token=tokenizer.cls_token,
        cls_token_segment_id=2 if model_type in ["xlnet"] else 0,
        sep_token=tokenizer.sep_token,
        sep_token_extra=bool(model_type in ["roberta"]),
        # roberta uses an extra separator b/w pairs of sentences,
        # cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
        pad_on_left=bool(model_type in ["xlnet"]),  # pad on the left for xlnet
        pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
        pad_token_segment_id=4 if model_type in ["xlnet"] else 0,
        pad_token_label_id=pad_token_label_id,
    )

    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long).to("cuda:0")
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long).to("cuda:0")
    all_label_ids = torch.tensor([f.label_ids for f in features], dtype=torch.long).to("cuda:0")

    del features
    gc.collect()

    model.eval()
    with torch.no_grad():
        inputs = {
            "input_ids": all_input_ids,
            "attention_mask": all_input_mask,
            "labels": all_label_ids,
        }
        outputs = model(**inputs)
        _, _, predicted_tags = outputs

    del model
    gc.collect()

    preds = []
    # Iterate through each line of text
    for x in range(all_input_ids.shape[0]):
        p = []
        tokens = tokenizer.convert_ids_to_tokens(all_input_ids[x])
        # Reminder: with label_list above, tag id 1 is "B-PROP" (beginning of a propaganda
        # span) and tag id 2 is "I-PROP" (inside/continuation of that span).
        for i in range(len(tokens)):
            if tokens[i] == "[SEP]":
                break
            if tokens[i] == "[CLS]":
                continue
            p.append((tokens[i], predicted_tags[x][i]))
        preds.append(p)

    del all_input_ids, all_input_mask, all_label_ids
    gc.collect()

    return preds
## huggingface
from transformers import DistilBertTokenizerFast, DistilBertForTokenClassification, DistilBertForSequenceClassification
import torch

distil_bert = 'distilbert-base-cased'

tokenizer = DistilBertTokenizerFast.from_pretrained(distil_bert, do_lower_case=False, add_special_tokens=True,
                                                    max_length=256, pad_to_max_length=True)
token_clf = DistilBertForTokenClassification.from_pretrained(distil_bert)
sequence_clf = DistilBertForSequenceClassification.from_pretrained(distil_bert)

sentence = 'Apple and Microsoft plan to form a joint venture for the development of cloud-based computing ' \
           'infrastructure.'

input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
token_clf(input_ids)
outputs = token_clf(input_ids)
token_logits = outputs[0]  # per-token classification logits, shape (1, seq_len, num_labels)

test = db.sample(n=10)  # db: DataFrame with a 'text' column, defined elsewhere
token_clf(**tokenizer.encode_plus(sentence, return_tensors='pt'))
tokenizer.batch_encode_plus(test.text.to_list())

## spacy
def get_sequences_with_2_orgs(text, dist=150):
    '''
    Uses spacy NER to identify organisations. If two organizations are detected within dist tokens
    from each other, extracts the sequence
    '''
    # Apply the model
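# A possible body for get_sequences_with_2_orgs(), sketched under the assumption that a small
# English spaCy pipeline is available (en_core_web_sm is an assumption, not from the original
# snippet); the original implementation is truncated above, so this is illustrative only.
import spacy

nlp = spacy.load("en_core_web_sm")

def get_sequences_with_2_orgs_sketch(text, dist=150):
    doc = nlp(text)
    orgs = [ent for ent in doc.ents if ent.label_ == "ORG"]
    sequences = []
    for first, second in zip(orgs, orgs[1:]):
        # token distance between the end of one ORG mention and the start of the next
        if second.start - first.end <= dist:
            sequences.append(doc[first.start:second.end].text)
    return sequences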
def train(
    train_data_path: str,
    model_save_path: str,
    prop_train: float = 0.8,
    no_product_labels: bool = False,
    seed: int = 9,
    evaluate_after_training: bool = True,
    eval_file_path: str = "../data/eval/eval_labeled.json",
):
    """
    train_data_path: The path to your training data. Will be split into train and validation sets.
    model_save_path: The path to where your model should be saved.
    prop_train: The proportion of your training data used for training; the remainder is held
        out for calculating the validation loss during training.
    no_product_labels: If True, removes Product tags from the training data and converts them
        to O's, so the model will not learn to extract Products.
    seed: Random seed to initialize the weights. I found good results with 9.
    evaluate_after_training: Whether to evaluate the model immediately after training and save
        the stats at `data/performance/{model_path}`.
    eval_file_path: Path to a custom eval file. Note this needs to be a LabelStudio-formatted
        JSON to work correctly. (See format of included eval file.)
    """
    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

    with open(train_data_path) as f:
        data = f.read()

    train_encodings, train_labels, val_encodings, val_labels = preprocess_bio_data(
        data, prop_train=prop_train, no_product_labels=no_product_labels)

    train_dataset = TokenClassificationDataset(train_encodings, train_labels)
    val_dataset = TokenClassificationDataset(val_encodings, val_labels)

    if no_product_labels:
        train_dataset.unique_tags = ["B-Ingredient", "I-Ingredient", "O"]
        val_dataset.unique_tags = ["B-Ingredient", "I-Ingredient", "O"]

    model = DistilBertForTokenClassification.from_pretrained(
        "distilbert-base-cased", num_labels=len(train_dataset.unique_tags))
    model.to(DEVICE)

    training_args = TrainingArguments(
        output_dir=model_save_path,
        num_train_epochs=7,               # total number of training epochs
        per_device_train_batch_size=32,   # batch size per device during training
        per_device_eval_batch_size=16,    # batch size for evaluation
        do_eval=True,
        evaluate_during_training=True,
        eval_steps=10,
        warmup_steps=50,
        weight_decay=0.01,                # strength of weight decay
        overwrite_output_dir=True,
        seed=seed,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
    )

    trainer.train()
    trainer.save_model(model_save_path)

    # Runs evaluation and saves a bunch of stats
    if evaluate_after_training:
        evaluate_model(
            model_save_path,
            eval_file_path=eval_file_path,
            no_product_labels=no_product_labels,
        )
        print("Model has been evaluated. Results are available at "
              f"../data/performance/{model_save_path.split('/')[-1]}.")
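# Hypothetical call to the train() function above; both paths are placeholders rather than
# values taken from the original project:
train(
    train_data_path="../data/train/train_labeled.txt",
    model_save_path="../models/ingredient_ner",
    no_product_labels=False,
)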
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=bs)

valid_data = TensorDataset(val_inputs, val_masks, val_tags)
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=bs)

test_data = TensorDataset(te_inputs, te_masks, te_tags)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=bs)

#config = DistilBertConfig.from_pretrained("distillbert_ner_c_model_save")
#model = DistillBertTagger(config = config)

model = DistilBertForTokenClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(tag2idx),
    output_attentions=False,
    output_hidden_states=False
)

#model.config.num_labels = len(tag2idx)
#model.classifier = nn.Linear(768, len(tag2idx))
#model.resize_token_embeddings(len(tokenizer))

model.to(device)

FULL_FINETUNING = False
if FULL_FINETUNING:
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.0}
    ]
def evaluate(dataset_path, model_weights_path, tag_path):
    with open(tag_path, "r") as tag_file:
        file_content = tag_file.read().strip()
    id_to_tag = file_content.splitlines()

    model = DistilBertForTokenClassification.from_pretrained(
        'distilbert-base-cased', num_labels=len(id_to_tag))
    model.load_state_dict(torch.load(model_weights_path))
    model.eval()

    texts, tags = read_dataset(dataset_path)

    unique_tags = file_content.splitlines()
    tag_to_id = {tag: id for id, tag in enumerate(unique_tags)}

    tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-cased')
    encodings = tokenizer(texts,
                          is_split_into_words=True,
                          return_offsets_mapping=True,
                          padding=True,
                          truncation=True)
    labels = encode_tags(tags, encodings, tag_to_id)
    encodings.pop("offset_mapping")  # we don't want to pass this to the model
    dataset = NERDataset(encodings, labels)

    dataloader = torch.utils.data.DataLoader(dataset, batch_size=128, num_workers=4)

    predictions = None
    labels = None
    for batch in dataloader:
        result = model(batch["input_ids"], batch["attention_mask"])
        if predictions is None:
            predictions = result.logits.detach().cpu()
            labels = batch["labels"].detach().cpu()
        else:
            predictions = torch.cat((predictions, result.logits.detach().cpu()), dim=0)
            labels = torch.cat((labels, batch["labels"].detach().cpu()), dim=0)

    predictions_softmax = torch.nn.Softmax(dim=2)(predictions)
    labels = labels.numpy()

    tp = 0
    fp = 0
    fn = 0
    tn = 0
    total_predictions = 0
    for sentence_labels, sentence_predictions in zip(labels, predictions_softmax):
        max_ids = torch.argmax(sentence_predictions, dim=1).numpy()
        for label, prediction in zip(sentence_labels, max_ids):
            if label == -100:  # ignore sub-word and special-token positions
                continue
            total_predictions += 1
            if label == prediction and label != 0:
                # TP
                tp += 1
            elif label != prediction and prediction != 0:
                # FP
                fp += 1
            elif label != prediction and prediction == 0 and label != 0:
                # FN
                fn += 1
            elif prediction == 0 and label == 0:
                tn += 1
            else:
                raise Exception("This should not happen, check your code")

    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    accuracy = (tp + tn) / total_predictions
    f1 = 2 * (precision * recall) / (precision + recall)

    print(f"Test results:\n\n"
          f"\tPrecision: {precision:.6f}\n"
          f"\tRecall: {recall:.6f}\n"
          f"\tF1 Score: {f1:.6f}\n"
          f"\tAccuracy: {accuracy*100:.3f}%")
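# A minimal, self-contained sketch (the counts below are made up, not from any real run)
# showing the same precision/recall/F1/accuracy formulas used by evaluate() above, where
# the "O" tag (id 0) is treated as the negative class:
tp, fp, fn, tn = 80, 10, 10, 900
precision = tp / (tp + fp)                            # 80 / 90  ~= 0.889
recall = tp / (tp + fn)                               # 80 / 90  ~= 0.889
f1 = 2 * (precision * recall) / (precision + recall)  # ~= 0.889
accuracy = (tp + tn) / (tp + fp + fn + tn)            # 980 / 1000 = 0.98
print(precision, recall, f1, accuracy)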