Example #1
    def __init__(self,
                 model_path: str = HF_MODEL_PATH,
                 no_product_labels: bool = False):

        # Both branches load the same way, so use model_path directly.
        self.model = DistilBertForTokenClassification.from_pretrained(model_path)
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.tokenizer = get_tokenizer()
        self.label_dict = id2tag_no_prod if no_product_labels else id2tag

        self.model.to(self.device)
        self.model.eval()
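Example #1 only shows the constructor. A minimal inference method for such a wrapper could look like the sketch below (an assumption, not code from the source: the method name predict_tags is hypothetical, get_tokenizer() is assumed to return a Hugging Face fast tokenizer, and self.label_dict is assumed to map integer label ids to tag strings):

    def predict_tags(self, text: str):
        # Sketch: tokenize, run the model, and map argmax label ids back to tag strings.
        encoding = self.tokenizer(text, truncation=True, return_tensors="pt").to(self.device)
        with torch.no_grad():
            logits = self.model(**encoding).logits
        label_ids = logits.argmax(dim=-1)[0].tolist()
        tokens = self.tokenizer.convert_ids_to_tokens(encoding["input_ids"][0].tolist())
        return [(token, self.label_dict[label_id]) for token, label_id in zip(tokens, label_ids)]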
Example #2
 def load_model(self, num_labels: int = None):
     # from_pretrained is a classmethod; do not instantiate the class first.
     model = DistilBertForTokenClassification.from_pretrained(
         "distilbert-base-uncased",
         num_labels=num_labels,
         label2id=self.params.tag2id,
         id2label=self.params.id2tag)
     return model
Example #3
 def load_model(self, model_name: str = "bert_ner_test"):
     # TODO model loaded from mlflow
     # Load model and tokenizer.
     config = DistilBertConfig.from_pretrained(model_name)
     # from_pretrained is a classmethod; pass the config rather than instantiating first.
     model = DistilBertForTokenClassification.from_pretrained(model_name, config=config)
     tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)
     return model, config, tokenizer
Example #4
 def __init__(self, model_path, tag_path):
     with open(tag_path, "r") as tag_file:
         file_content = tag_file.read().strip()
         self.id_to_tag = file_content.splitlines()
     self.model = DistilBertForTokenClassification.from_pretrained('distilbert-base-cased', num_labels=len(self.id_to_tag))
     self.model.load_state_dict(torch.load(model_path))
     self.model.eval()
     self.tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-cased')
Example #5
        def create_and_check_distilbert_for_token_classification(
                self, config, input_ids, input_mask, sequence_labels,
                token_labels, choice_labels):
            config.num_labels = self.num_labels
            model = DistilBertForTokenClassification(config=config)
            model.eval()

            loss, logits = model(input_ids,
                                 attention_mask=input_mask,
                                 labels=token_labels)
            result = {
                "loss": loss,
                "logits": logits,
            }
            self.parent.assertListEqual(
                list(result["logits"].size()),
                [self.batch_size, self.seq_length, self.num_labels])
            self.check_loss_output(result)
Example #6
 def __init__(self, model_path=None, use_cuda=False):
     if not model_path:
         model_path = get_model_path()
     if not os.path.exists(model_path):
         raise FileNotFoundError("Cannot find model under " + model_path)
     self.device = "cuda" if use_cuda and torch.cuda.is_available() else "cpu"
     self.model = DistilBertForTokenClassification.from_pretrained(model_path)
     self.model.to(self.device)
     self.tokenizer = DistilBertTokenizerFast.from_pretrained(model_path)
     self.label_map = self.get_label_map(model_path)
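get_label_map is not shown in this example. One hypothetical implementation (an assumption: it presumes the fine-tuned model was saved with an id2label mapping in its config.json) could simply read the mapping back from the saved config:

 def get_label_map(self, model_path):
     # Sketch only: reuse the id -> label mapping persisted in the saved config.
     from transformers import DistilBertConfig
     return DistilBertConfig.from_pretrained(model_path).id2label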
Example #7
def retrain(epochs_per_item=2, min_to_train=5):
    global current_model
    global currently_training
    global new_annotation_count
    global labeled_data
    

    if currently_training:
        "skipping while model already training"
        return
    
    if len(labeled_data) < min_to_train:
        print("too few annotations to train: "+str(len(labeled_data)))        
        return
        
    currently_training = True
    new_annotation_count = 0
    
    new_model = DistilBertForTokenClassification.from_pretrained('distilbert-base-cased', num_labels=5)

        
    for epoch in range(0, epochs_per_item):
        print("epoch "+str(epoch))
        shuffle(labeled_data) 
        for report in labeled_data:
            annotations = json.loads(report[8])
            report_text = report[1]
            train_item(new_model, annotations, report_text)
         

            eel.sleep(0.01) # allow other processes through

 
    '''
    MODEL EVALUATION CODE HERE IF YOU WANT TO TEST THAT IT IS GETTING BETTER    
    '''
    
    current_model = new_model
    
    timestamp = re.sub(r'\.[0-9]*', '_', str(datetime.now())).replace(" ", "_").replace("-", "").replace(":", "")
    number_items = str(len(labeled_data))              
                     
    model_path = "models/"+timestamp+number_items+".model"
    current_model.save_pretrained(model_path)
    if verbose:
        print("saved model to "+model_path)
    clean_old_models()
    
        
    currently_training = False
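train_item and clean_old_models are project helpers that are not shown on this page. Purely as an illustration of the per-item update the loop above expects, a hypothetical train_item might perform one optimization step on a single annotated report; annotations_to_label_ids, the tokenizer choice, and the optimizer settings below are assumptions, not the original code:

def train_item(model, annotations, report_text):
    # Hypothetical sketch only: one gradient step on a single labeled report.
    tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-cased')
    optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
    encoding = tokenizer(report_text, truncation=True, return_tensors="pt")
    # Assumed helper: converts span annotations into per-token label ids of shape (1, seq_len).
    labels = annotations_to_label_ids(annotations, encoding)
    model.train()
    loss = model(**encoding, labels=labels).loss
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()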
Example #8
        def create_and_check_distilbert_for_token_classification(
            self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
        ):
            config.num_labels = self.num_labels
            model = DistilBertForTokenClassification(config=config)
            model.to(torch_device)
            model.eval()

            result = model(input_ids, attention_mask=input_mask, labels=token_labels)
            self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels))
Example #9
def train(dataset_path, tag_path, model_save_path):
    texts, tags = read_dataset(dataset_path)
    #train_texts, val_texts, train_tags, val_tags = train_test_split(texts, tags, test_size=.2, shuffle=True)
    with open(tag_path, "r") as tag_file:
        content = tag_file.read().strip()
        unique_tags = content.splitlines()
    tag_to_id = {tag: id for id, tag in enumerate(unique_tags)}
    tokenizer = DistilBertTokenizerFast.from_pretrained(
        'distilbert-base-cased')
    train_encodings = tokenizer(texts,
                                is_split_into_words=True,
                                return_offsets_mapping=True,
                                padding=True,
                                truncation=True)
    #val_encodings = tokenizer(val_texts, is_split_into_words=True, return_offsets_mapping=True, padding=True, truncation=True)
    train_labels = encode_tags(tags, train_encodings, tag_to_id)
    #val_labels = encode_tags(val_tags, val_encodings, tag_to_id)
    train_encodings.pop(
        "offset_mapping")  # we don't want to pass this to the model
    #val_encodings.pop("offset_mapping")
    train_dataset = NERDataset(train_encodings, train_labels)
    #val_dataset = NERDataset(val_encodings, val_labels)
    model = DistilBertForTokenClassification.from_pretrained(
        'distilbert-base-cased', num_labels=len(unique_tags))

    training_args = TrainingArguments(
        output_dir='./results',  # output directory,
        overwrite_output_dir=True,
        num_train_epochs=100,  # total number of training epochs
        per_device_train_batch_size=16,  # batch size per device during training
        per_device_eval_batch_size=64,  # batch size for evaluation
        warmup_steps=500,  # number of warmup steps for learning rate scheduler
        weight_decay=0.01,  # strength of weight decay
        logging_dir='./logs',  # directory for storing logs
        logging_steps=10)

    trainer = Trainer(
        model=model,  # the instantiated 🤗 Transformers model to be trained
        args=training_args,  # training arguments, defined above
        train_dataset=train_dataset,  # training dataset
        #eval_dataset=val_dataset             # evaluation dataset
    )
    model.train()
    trainer.train()
    #print("eval", trainer.evaluate())
    trainer.save_model(model_save_path)
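NERDataset and encode_tags are used in Examples #9 and #19 (and the analogous WNUTDataset/encode_tags in #17) but are never defined on this page. Reasonable stand-ins, assumed from the standard Hugging Face token-classification tutorial rather than taken from the original authors, are:

import numpy as np
import torch


class NERDataset(torch.utils.data.Dataset):
    """Wraps tokenizer encodings plus aligned label ids for the Trainer / DataLoader."""

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)


def encode_tags(tags, encodings, tag2id):
    # Align word-level tags to subword tokens via the offset mapping; special tokens
    # and subword continuations receive the ignore index -100.
    labels = [[tag2id[tag] for tag in doc] for doc in tags]
    encoded_labels = []
    for doc_labels, doc_offset in zip(labels, encodings.offset_mapping):
        doc_enc_labels = np.ones(len(doc_offset), dtype=int) * -100
        arr_offset = np.array(doc_offset)
        # a token starts a word when its offset begins at 0 and has nonzero length
        doc_enc_labels[(arr_offset[:, 0] == 0) & (arr_offset[:, 1] != 0)] = doc_labels
        encoded_labels.append(doc_enc_labels.tolist())
    return encoded_labels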
Example #10
 def __init__(self,
              num_labels,
              model_name,
              output_hidden_states=False,
              output_attentions=False,
              batch_first=True,
              use_crf=True):
     super(TokenDistilBERT, self).__init__()
     self.num_labels = num_labels
     self.batch_first = batch_first
     self.use_crf = use_crf
     self.tokendistilbert = DistilBertForTokenClassification.from_pretrained(
         model_name,
         num_labels=self.num_labels,
         output_hidden_states=output_hidden_states,
         output_attentions=output_attentions)
     if self.use_crf:
         self.crf = CRF(self.num_labels, batch_first=self.batch_first)
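Only the constructor of this CRF-augmented tagger is shown. A hypothetical forward pass (a sketch, not the source code; it assumes the pytorch-crf CRF layer with batch_first=True and labels that contain valid tag ids at every masked-in position rather than -100 padding) might look like:

 def forward(self, input_ids, attention_mask, labels=None):
     # Hypothetical sketch: DistilBERT emissions feed the optional CRF layer.
     emissions = self.tokendistilbert(input_ids,
                                      attention_mask=attention_mask).logits
     if not self.use_crf:
         return emissions
     mask = attention_mask.bool()
     if labels is not None:
         # pytorch-crf returns a log-likelihood; negate it to get a loss to minimize.
         return -self.crf(emissions, labels, mask=mask, reduction="mean")
     return self.crf.decode(emissions, mask=mask)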
Example #11
    def makeMultilabelModel(self, modelName, num_labels=10, root='', **kwargs):
        if modelName in [
                'distilbert-base-uncased', 'distilbert2/', 'distilbert3/'
        ]:
            print(root)
            tokenizer = DistilBertTokenizerFast.from_pretrained(
                'distilbert-base-uncased')
            model = DistilBertForTokenClassification.from_pretrained(
                root + modelName, num_labels=num_labels, **kwargs)
        if modelName == 'bertweet':
            tokenizer = AutoTokenizer.from_pretrained('vinai/bertweet-base')
            model = AutoModelForTokenClassification.from_pretrained(
                root + "vinai/bertweet-base", num_labels=num_labels, **kwargs)
        if modelName == 'distilroberta-base':
            tokenizer = AutoTokenizer.from_pretrained('distilroberta-base',
                                                      add_prefix_space=True)
            model = AutoModelForTokenClassification.from_pretrained(
                root + "distilroberta-base", num_labels=num_labels, **kwargs)
        if modelName == 'lstm':
            tokenizer = AutoTokenizer.from_pretrained(
                'distilbert-base-uncased')
            model = LSTMTagger(128, 64, 2, tokenizer.vocab_size, num_labels)
        if modelName == 'albert-base-v2':
            tokenizer = AutoTokenizer.from_pretrained('albert-base-v2',
                                                      add_prefix_space=True)
            model = AutoModelForTokenClassification.from_pretrained(
                root + "albert-base-v2", num_labels=num_labels, **kwargs)
        if modelName == 'squeezebert/squeezebert-uncased':
            tokenizer = AutoTokenizer.from_pretrained(
                'squeezebert/squeezebert-uncased', add_prefix_space=True)
            model = AutoModelForTokenClassification.from_pretrained(
                root + "squeezebert/squeezebert-uncased",
                num_labels=num_labels,
                **kwargs)
        if modelName == 'xlnet-base-cased':
            tokenizer = AutoTokenizer.from_pretrained('xlnet-base-cased',
                                                      add_prefix_space=True)
            model = AutoModelForTokenClassification.from_pretrained(
                root + "xlnet-base-cased", num_labels=num_labels, **kwargs)

        return tokenizer, model
Example #12
def load_existing_model():
    global current_model 

    model_path = ""
    
    files = os.listdir('models') 
    for file_name in files:
        if file_name.endswith(".model"):
            model_path = 'models/'+file_name
                
    if model_path != '':    
        if verbose:
            print("Loading model from "+model_path)
        # Saved models come from the token-classification retrain loop, so load them with the matching head.
        current_model = DistilBertForTokenClassification.from_pretrained(model_path, num_labels=5)
        eel.sleep(0.1)
        # get_predictions()
    else:
        if verbose:
            print("Creating new uninitialized model (OK to ignore warnings)")

        current_model = DistilBertForTokenClassification.from_pretrained('distilbert-base-uncased', num_labels=5)
Example #13
def test_model(model_path, tag_path):
    with open(tag_path, "r") as tag_file:
        file_content = tag_file.read().strip()
        id_to_tag = file_content.splitlines()
    model = DistilBertForTokenClassification.from_pretrained(
        'distilbert-base-cased', num_labels=len(id_to_tag))
    model.load_state_dict(torch.load(model_path))
    model.eval()
    tokenizer = DistilBertTokenizerFast.from_pretrained(
        'distilbert-base-cased')
    run = True
    while run:
        print("Input sentence to test:")
        text = input("> ")
        encoded = tokenizer(text,
                            is_split_into_words=False,
                            return_offsets_mapping=True,
                            padding=True,
                            truncation=True,
                            return_tensors="pt")
        output = model(encoded.input_ids, encoded.attention_mask)
        logits = output.logits
        logits_softmax = torch.nn.Softmax(dim=2)(logits).detach().cpu()
        entities = []
        for token_index in range(logits_softmax.shape[1]):
            max_id = torch.argmax(logits_softmax[0, token_index, :]).numpy()
            max_id_value = logits_softmax[0, token_index, max_id].numpy()
            current_offsets = encoded.offset_mapping[0, token_index, :]
            if current_offsets[0] == 0 and current_offsets[1] == 0:
                continue
            word = text[current_offsets[0]:current_offsets[1]]
            if id_to_tag[max_id] == "O":
                continue
            entities.append((word, id_to_tag[max_id], max_id_value))
        print("Found entities:")
        for (word, tag, conf) in entities:
            print(f"'{word}': {tag} (conf: {conf*100.0:.4f}%)")
Example #14
def predict_spans(sentences):
    tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased",
                                                    do_lower_case=True)

    # Load configuration
    config = DistilBertConfig.from_pretrained("distilbert-base-uncased",
                                              num_labels=3)

    # Load model
    bert_model = DistilBertForTokenClassification.from_pretrained(
        "distilbert-base-uncased", config=config)

    checkpoints = [output_dir]
    for checkp in checkpoints:
        model = BertLstmCrf(
            bert_model,
            config,
            num_labels=3,
            embedding_dim=config.hidden_size,
            hidden_dim=int(config.hidden_size / 2),
            rnn_layers=0,
            # rnn_dropout=config.hidden_dropout_prob,
            # output_dropout=config.hidden_dropout_prob,
            use_cuda=True,
        )
        checkpoint = os.path.join(checkp, WEIGHTS_NAME)
        state_dict = torch.load(checkpoint)
        model.load_state_dict(state_dict, strict=False)
        model.to("cuda:0")

    del bert_model
    gc.collect()

    examples = [
        InputExample(words=sentence.split(),
                     guid=[],
                     labels=["O" for x in sentence.split()])
        for sentence in sentences
    ]
    model_type = "distilbert"
    max_seq_length = 256
    pad_token_label_id = CrossEntropyLoss().ignore_index
    label_list = ["O", "B-PROP", "I-PROP"]
    features = convert_examples_to_features(
        examples,
        label_list,
        max_seq_length,
        tokenizer,
        cls_token_at_end=bool(model_type in ["xlnet"]),
        # xlnet has a cls token at the end
        cls_token=tokenizer.cls_token,
        cls_token_segment_id=2 if model_type in ["xlnet"] else 0,
        sep_token=tokenizer.sep_token,
        sep_token_extra=bool(model_type in ["roberta"]),
        # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
        pad_on_left=bool(model_type in ["xlnet"]),
        # pad on the left for xlnet
        pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
        pad_token_segment_id=4 if model_type in ["xlnet"] else 0,
        pad_token_label_id=pad_token_label_id,
    )

    all_input_ids = torch.tensor([f.input_ids for f in features],
                                 dtype=torch.long).to("cuda:0")
    all_input_mask = torch.tensor([f.input_mask for f in features],
                                  dtype=torch.long).to("cuda:0")
    all_label_ids = torch.tensor([f.label_ids for f in features],
                                 dtype=torch.long).to("cuda:0")

    del features
    gc.collect()

    model.eval()
    with torch.no_grad():
        inputs = {
            "input_ids": all_input_ids,
            "attention_mask": all_input_mask,
            "labels": all_label_ids,
        }
        outputs = model(**inputs)
        _, _, predicted_tags = outputs

    del model
    gc.collect()

    preds = []
    # Iterate through each line of text
    for x in range(all_input_ids.shape[0]):
        p = []
        tokens = tokenizer.convert_ids_to_tokens(all_input_ids[x])

        # JUST A REMINDER:
        #  with label_list = ["O", "B-PROP", "I-PROP"], label 1 marks the start of a
        #  propaganda span and label 2 marks tokens inside (continuing) that span.
        for i in range(len(tokens)):
            if tokens[i] == "[SEP]":
                break
            if tokens[i] == "[CLS]":
                continue
            p.append((tokens[i], predicted_tags[x][i]))

        preds.append(p)

    del all_input_ids, all_input_mask, all_label_ids
    gc.collect()

    return preds
Example #15
train_dataloader = DataLoader(train_data, sampler = train_sampler, batch_size = bs)

valid_data = TensorDataset(val_inputs, val_masks, val_tags)
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(valid_data, sampler = valid_sampler, batch_size = bs)

test_data = TensorDataset(te_inputs, te_masks, te_tags)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler = test_sampler, batch_size = bs)

#config = DistilBertConfig.from_pretrained("distillbert_ner_c_model_save")
#model = DistillBertTagger(config = config)

model = DistilBertForTokenClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(tag2idx),
    output_attentions = False,
    output_hidden_states = False
)
#model.config.num_labels = len(tag2idx)
#model.classifier = nn.Linear(768, len(tag2idx))
#model.resize_token_embeddings(len(tokenizer))
model.to(device)

FULL_FINETUNING = False
if FULL_FINETUNING:
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
Example #16
def train(
    train_data_path: str,
    model_save_path: str,
    prop_train: float = 0.8,
    no_product_labels: bool = False,
    seed: int = 9,
    evaluate_after_training: bool = True,
    eval_file_path: str = "../data/eval/eval_labeled.json",
):
    """
    train_data_path: The path to your training data. Will be split into
    training and validation sets according to `prop_train`.
    model_save_path: The path to where your model should be saved.
    prop_train: The proportion of the data used for training; the remainder
    is held out for calculating the validation loss during training.
    no_product_labels: If True, removes Product tags from the training data
    and converts them to O's, so the model will not learn to extract Products.
    seed: Random seed to initialize the weights. I found good results with 9.
    evaluate_after_training: Whether to evaluate the model immediately after
    training and save the stats at `data/performance/{model_path}`.
    eval_file_path: Path to a custom eval file. Note this needs to be a
    LabelStudio-formatted JSON to work correctly. (See format of included
    eval file.)
    """

    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

    with open(train_data_path) as f:
        data = f.read()

    train_encodings, train_labels, val_encodings, val_labels = preprocess_bio_data(
        data, prop_train=prop_train, no_product_labels=no_product_labels)
    train_dataset = TokenClassificationDataset(train_encodings, train_labels)
    val_dataset = TokenClassificationDataset(val_encodings, val_labels)

    if no_product_labels:
        train_dataset.unique_tags = ["B-Ingredient", "I-Ingredient", "O"]
        val_dataset.unique_tags = ["B-Ingredient", "I-Ingredient", "O"]

    model = DistilBertForTokenClassification.from_pretrained(
        "distilbert-base-cased", num_labels=len(train_dataset.unique_tags))
    model.to(DEVICE)

    training_args = TrainingArguments(
        output_dir=model_save_path,
        num_train_epochs=7,  # total number of training epochs
        per_device_train_batch_size=32,  # batch size per device during training
        per_device_eval_batch_size=16,  # batch size for evaluation
        do_eval=True,
        evaluate_during_training=True,
        eval_steps=10,
        warmup_steps=50,
        weight_decay=0.01,  # strength of weight decay
        overwrite_output_dir=True,
        seed=seed,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
    )

    trainer.train()
    trainer.save_model(model_save_path)

    # Runs evaluation and saves a bunch of stats
    if evaluate_after_training:
        evaluate_model(
            model_save_path,
            eval_file_path=eval_file_path,
            no_product_labels=no_product_labels,
        )
        print("Model has been evaluated. Results are available at "
              f"../data/performance/{model_save_path.split('/')[-1]}.")
Example #17
val_encodings = tokenizer(validate_texts, is_split_into_words=True, return_offsets_mapping=True, padding=True, truncation=True)



train_labels = encode_tags(train_is_argum, train_encodings, tag2id)
val_labels = encode_tags(validate_is_argum, val_encodings, tag2id)


train_encodings.pop("offset_mapping") # we don't want to pass this to the model
val_encodings.pop("offset_mapping")
train_dataset = WNUTDataset(train_encodings, train_labels)
val_dataset = WNUTDataset(val_encodings, val_labels)

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model = DistilBertForTokenClassification.from_pretrained('distilbert-base-cased', num_labels=len(unique_tags))

model.to(device)
model.train()

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

optim = AdamW(model.parameters(), lr=5e-5)

for epoch in range(3):
    for batch in train_loader:
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]
        loss.backward()
        optim.step()
Example #18
elif args.pre_wgts == 'pubmed-full':
    pre_wgts = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext"
elif args.pre_wgts == 'pubmed-abs':
    pre_wgts = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract"
else:  # args.pre_wgts == 'bert-base'
    pre_wgts = "bert-base-uncased"

if args.model == 'bert':
    model = BertForTokenClassification.from_pretrained(pre_wgts,
                                                       num_labels=n_tags)
if args.model == 'bert_crf':
    model = BERT_CRF.from_pretrained(pre_wgts, num_labels=n_tags)
if args.model == 'bert_lstm_crf':
    model = BERT_LSTM_CRF.from_pretrained(pre_wgts, num_labels=n_tags)
if args.model == 'distil':
    model = DistilBertForTokenClassification.from_pretrained(pre_wgts,
                                                             num_labels=n_tags)
if args.model == 'distil_crf':
    model = Distil_CRF.from_pretrained(pre_wgts, num_labels=n_tags)

model.to(device)
optimizer = AdamW(model.parameters(), lr=args.lr)

# Slanted triangular Learning rate scheduler
total_steps = len(train_loader) * args.epochs // args.accum_step
warm_steps = int(total_steps * args.warm_frac)
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=warm_steps,
                                            num_training_steps=total_steps)

#%% Train the model
if not os.path.exists(args.exp_dir):
    os.makedirs(args.exp_dir)
Example #19
def evaluate(dataset_path, model_weights_path, tag_path):
    with open(tag_path, "r") as tag_file:
        file_content = tag_file.read().strip()
    id_to_tag = file_content.splitlines()
    model = DistilBertForTokenClassification.from_pretrained(
        'distilbert-base-cased', num_labels=len(id_to_tag))
    model.load_state_dict(torch.load(model_weights_path))
    model.eval()
    texts, tags = read_dataset(dataset_path)
    unique_tags = file_content.splitlines()
    tag_to_id = {tag: id for id, tag in enumerate(unique_tags)}
    tokenizer = DistilBertTokenizerFast.from_pretrained(
        'distilbert-base-cased')
    encodings = tokenizer(texts,
                          is_split_into_words=True,
                          return_offsets_mapping=True,
                          padding=True,
                          truncation=True)
    labels = encode_tags(tags, encodings, tag_to_id)
    encodings.pop("offset_mapping")  # we don't want to pass this to the model
    dataset = NERDataset(encodings, labels)
    dataloader = torch.utils.data.DataLoader(dataset,
                                             batch_size=128,
                                             num_workers=4)

    predictions = None
    labels = None
    for batch in dataloader:
        result = model(batch["input_ids"], batch["attention_mask"])
        if predictions is None:
            predictions = result.logits.detach().cpu()
            labels = batch["labels"].detach().cpu()
        else:
            predictions = torch.cat(
                (predictions, result.logits.detach().cpu()), dim=0)
            labels = torch.cat((labels, batch["labels"].detach().cpu()), dim=0)

    predictions_softmax = torch.nn.Softmax(dim=2)(predictions)
    labels = labels.numpy()

    tp = 0
    fp = 0
    fn = 0
    tn = 0
    total_predictions = 0
    for sentence_labels, sentence_predictions in zip(labels,
                                                     predictions_softmax):
        max_ids = torch.argmax(sentence_predictions, dim=1).numpy()
        for label, prediction in zip(sentence_labels, max_ids):
            if label == -100:
                continue
            total_predictions += 1
            if label == prediction and label != 0:  # TP
                tp += 1
            elif label != prediction and prediction != 0:  # FP
                fp += 1
            elif label != prediction and prediction == 0 and label != 0:  # FN
                fn += 1
            elif prediction == 0 and label == 0:
                tn += 1
            else:
                raise Exception("This should not happen, check your code")

    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    accuracy = (tp + tn) / total_predictions
    f1 = 2 * (precision * recall) / (precision + recall)

    print(
        f"Test results:\n\n\tPrecision: {precision:.6f}\n\tRecall: {recall:.6f}\n\tF1 Score: {f1:.6f}\n\tAccuracy: {accuracy*100:.3f}%"
    )
## huggingface
from transformers import DistilBertTokenizerFast, DistilBertForTokenClassification, DistilBertForSequenceClassification
import torch

distil_bert = 'distilbert-base-cased'
tokenizer = DistilBertTokenizerFast.from_pretrained(distil_bert, do_lower_case=False, add_special_tokens=True,
                                                max_length=256, pad_to_max_length=True)
token_clf = DistilBertForTokenClassification.from_pretrained(distil_bert)
sequence_clf = DistilBertForSequenceClassification.from_pretrained(distil_bert)

sentence = 'Apple and Microsoft plan to form a joint venture for the development of cloud-based computing ' \
           'infrastructure.'

input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
outputs = token_clf(input_ids)

logits = outputs[0]  # token-classification logits, shape (1, seq_len, num_labels)

test = db.sample(n=10)

token_clf(**tokenizer.encode_plus(sentence, return_tensors="pt"))
tokenizer.batch_encode_plus(test.text.to_list())

## spacy
def get_sequences_with_2_orgs(text, dist=150):
    ''' Uses spacy NER to identify organisations. If two organizations are detected within dist
    tokens from each other, extracts the sequence
    '''
    # Apply the model