def model_trainer(model_path, train_dataset, test_dataset):
    # model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=4)
    model = RobertaForTokenClassification.from_pretrained('roberta-base',
                                                          num_labels=3,
                                                          return_dict=True)
    training_args = TrainingArguments(
        output_dir=model_path,           # output directory
        num_train_epochs=1,              # total number of training epochs
        per_device_train_batch_size=16,  # batch size per device during training
        per_device_eval_batch_size=16,   # batch size for evaluation
        # warmup_steps=0,                # number of warmup steps for learning rate scheduler
        weight_decay=0.1,                # strength of weight decay
        logging_dir=os.path.join(model_path, 'logs'),  # directory for storing logs
        learning_rate=5e-5,
        logging_steps=1000,
        save_steps=2700,
        # save_model = os.path.join(model_path, 'final_model')
    )
    trainer = Trainer(
        model=model,                  # the instantiated 🤗 Transformers model to be trained
        args=training_args,           # training arguments, defined above
        train_dataset=train_dataset,  # training dataset
        eval_dataset=test_dataset,    # evaluation dataset
        compute_metrics=compute_metrics,
    )
    return trainer, model
def __init__(self, use_gpu=True, tokenizer=None):
    super().__init__()
    MODEL_NAME = 'iarfmoose/roberta-small-bulgarian-pos'
    if tokenizer:
        self.tokenizer = tokenizer
    else:
        self.tokenizer = RobertaTokenizerFast.from_pretrained(MODEL_NAME)
    self.model = RobertaForTokenClassification.from_pretrained(MODEL_NAME)
    self.model.to(self.device)
    self.tag_to_id = {
        'ADJ': 0, 'ADP': 1, 'PUNCT': 2, 'ADV': 3, 'AUX': 4, 'SYM': 5,
        'INTJ': 6, 'CCONJ': 7, 'X': 8, 'NOUN': 9, 'DET': 10, 'PROPN': 11,
        'NUM': 12, 'VERB': 13, 'PART': 14, 'PRON': 15, 'SCONJ': 16
    }
    self.id_to_tag = {self.tag_to_id[tag]: tag for tag in self.tag_to_id}
def __init__(self, use_gpu=True, tokenizer=None):
    super().__init__()
    MODEL_NAME = 'iarfmoose/roberta-small-bulgarian-ner'
    if tokenizer:
        self.tokenizer = tokenizer
    else:
        self.tokenizer = RobertaTokenizerFast.from_pretrained(MODEL_NAME)
    self.model = RobertaForTokenClassification.from_pretrained(MODEL_NAME)
    self.model.to(self.device)
    self.tag_to_id = {
        'O': 0, 'I-PRO': 1, 'I-PER': 2, 'I-ORG': 3, 'I-LOC': 4, 'I-EVT': 5,
        'B-PRO': 6, 'B-PER': 7, 'B-ORG': 8, 'B-LOC': 9, 'B-EVT': 10
    }
    self.id_to_tag = {self.tag_to_id[tag]: tag for tag in self.tag_to_id}
def __init__(self, tokenizer):
    super(UpperSentDetectorModel, self).__init__()
    self.tokenizer = tokenizer
    self.transformer = \
        RobertaForTokenClassification.from_pretrained('roberta-base')
    self.transformer.to(device)
    self.softmax = torch.nn.Softmax(dim=2)
    self.threshold = 0.9  # may be changed if you need to
def create_and_check_roberta_for_token_classification(
        self, config, input_ids, token_type_ids, input_mask,
        sequence_labels, token_labels, choice_labels):
    config.num_labels = self.num_labels
    model = RobertaForTokenClassification(config=config)
    model.eval()
    loss, logits = model(input_ids,
                         attention_mask=input_mask,
                         token_type_ids=token_type_ids,
                         labels=token_labels)
    result = {
        "loss": loss,
        "logits": logits,
    }
    self.parent.assertListEqual(
        list(result["logits"].size()),
        [self.batch_size, self.seq_length, self.num_labels])
    self.check_loss_output(result)
def model_fn(model_dir):
    print("Loading model.")
    from transformers import RobertaForTokenClassification
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = RobertaForTokenClassification.from_pretrained('roberta-base',
                                                          num_labels=20)
    with open(os.path.join(model_dir, 'model.pth'), 'rb') as f:
        model.load_state_dict(torch.load(f))
    return model.to(device)
def create_and_check_for_token_classification(
        self, config, input_ids, token_type_ids, input_mask,
        sequence_labels, token_labels, choice_labels):
    config.num_labels = self.num_labels
    model = RobertaForTokenClassification(config=config)
    model.to(torch_device)
    model.eval()
    result = model(input_ids,
                   attention_mask=input_mask,
                   token_type_ids=token_type_ids,
                   labels=token_labels)
    self.parent.assertEqual(
        result.logits.shape,
        (self.batch_size, self.seq_length, self.num_labels))
def get_model():
    # Load model
    config = RobertaConfig.from_pretrained(
        "BERTweet_base_transformers/config.json", num_labels=3)
    BERTweet = RobertaForTokenClassification.from_pretrained(
        "BERTweet_base_transformers/model.bin", config=config)
    optimizer = AdamW(
        BERTweet.parameters(),
        lr=1e-05,  # args.learning_rate - default is 5e-5
        eps=1e-8   # args.adam_epsilon - default is 1e-8
    )
    return BERTweet, optimizer
def sentenceLabel(sentence):
    f = open('./model_save/tag2idx.pckl', 'rb')
    tag2idx = pickle.load(f)
    device = torch.device("cpu")
    output_dir = './model_save/'
    idx2tag = dict((v, k) for k, v in tag2idx.items())
    tokenizer = RobertaTokenizer.from_pretrained(output_dir)
    model = RobertaForTokenClassification.from_pretrained(output_dir)
    model.to(device)

    # predict
    all_tokens = []
    all_entities = []
    tokenized_sentence = tokenizer.encode(sentence)
    input_ids = torch.tensor([tokenized_sentence]).to(device)
    predictions = []
    with torch.no_grad():
        output = model(input_ids)
    output = output[0].detach().cpu().numpy()
    predictions.extend([list(p) for p in np.argmax(output, axis=2)])
    tags_predictions = []
    for x in predictions[0]:
        tags_predictions.append(idx2tag[int(x)])

    # get tokens from ids, stripping the leading 'Ġ' that marks word starts in RoBERTa's BPE vocabulary
    tokens = []
    for x in tokenizer.convert_ids_to_tokens(tokenized_sentence):
        if x.startswith('Ġ'):
            tokens.append(x[1:])
        else:
            tokens.append(x)

    all_entities.append(tags_predictions[1:-1])
    all_tokens.append(tokens[1:-1])
    # print(all_tokens)
    # print(all_entities)
    return all_tokens, all_entities
def train(self):
    if self.has_started():
        last_checkpoint = self.get_latest_checkpoint()
        logger.info(f"Resuming training from: {last_checkpoint}")
        model = AutoModelForTokenClassification.from_pretrained(
            last_checkpoint, config=self.config)
    else:
        model = RobertaForTokenClassification.from_pretrained(
            "neurocode/IsRoBERTa", config=self.config)
    trainer = Trainer(model=model,
                      args=self.training_args,
                      train_dataset=self.dataset)
    trainer.train()
    trainer.save_model(f"{self.model_dir}")
    self.upload()
def model_trainer(args, test_dataset):
    # model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=4)
    # args.model_path e.g. '/anfs/bigdisc/rmya2/faiss_data/results_table_to_cell2/checkpoint-1400/'
    model = RobertaForTokenClassification.from_pretrained(args.model_path,
                                                          num_labels=3,
                                                          return_dict=True)
    training_args = TrainingArguments(
        per_device_train_batch_size=16,  # batch size per device during training
        per_device_eval_batch_size=16,   # batch size for evaluation
        # warmup_steps=0,                # number of warmup steps for learning rate scheduler
        logging_dir='./logs',
        output_dir='./model_output')
    trainer = Trainer(
        model=model,                # the instantiated 🤗 Transformers model to be trained
        args=training_args,         # training arguments, defined above
        eval_dataset=test_dataset,  # evaluation dataset
        compute_metrics=compute_metrics,
    )
    return trainer, model
import pickle

f = open('tags_vals.pckl', 'rb')
tags_vals = pickle.load(f)
f.close()
print(tags_vals)

f = open('tag2idx.pckl', 'rb')
tag2idx = pickle.load(f)
f.close()
idx2tag = dict((v, k) for k, v in tag2idx.items())

device = torch.device("cpu")
output_dir = './roberta_few_labels/'
tokenizer = RobertaTokenizer.from_pretrained(output_dir)
model = RobertaForTokenClassification.from_pretrained(output_dir)
model.to(device)

text = 'O, B-Diagnostic_procedure, I-Diagnostic_procedure,B-Biological_structure, I-Biological_structure, B-Sign_symptom, I-Sign_symptom, B-Detailed_description, I-Detailed_description, B-Lab_value, I-Lab_value, B-Date, I-Date, B-Age, I-Age, B-Clinical_event, I-Clinical_event, B-Date, I-Date, B-Disease_disorder, I-Disease_disorder, B-Nonbiological_location, I-Nonbiological_location, B-Severity, I-Severity, B-Sex, B-Therapeutic_procedure, I-Therapeutic_procedure'
tag_values = text.split(',')
print(tag_values)

query = "a woman aged 65 has a fever and a cough on march at a hospital"
tokenized_sentence = tokenizer.encode(query)
print(tokenized_sentence)
input_ids = torch.tensor([tokenized_sentence]).to(device)
print(input_ids)

predictions = []
with torch.no_grad():
    output = model(input_ids)
test_data = TensorDataset(test_inputs, test_masks, test_tags)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=bs)

if args.pretrained_model == 'bert-base-cased':
    model = BertForTokenClassification.from_pretrained(
        args.pretrained_model,
        num_labels=len(tag2idx),
        output_attentions=False,
        output_hidden_states=False)
if args.pretrained_model == 'bert-base-cased-crf':
    model = bertCRF(num_classes=len(tag2idx), model_name=args.pretrained_model)
if args.pretrained_model == 'roberta-base':
    model = RobertaForTokenClassification.from_pretrained(
        args.pretrained_model,
        num_labels=len(tag2idx),
        output_attentions=False,
        output_hidden_states=False)
if args.pretrained_model == 'roberta-base-crf':
    model = bertCRF(num_classes=len(tag2idx), model_name=args.pretrained_model)

model.cuda()

param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [{
    'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
    'weight_decay_rate': 0.01
}, {
    'params':
parser.add_argument(
    "--use_goodreads",
    help="calculate citation graph with external goodreads database",
    action='store_true')
parser.add_argument(
    "--use_citation_model",
    help="whether to use NER model to remove false positives from detected citations",
    action='store_true')
args = parser.parse_args()

metadata_to_use = 'goodreads' if args.use_goodreads else 'calibre'

model = RobertaForTokenClassification.from_pretrained(
    'fine-tuned-model-ner-better-data-3')
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model.share_memory()

bkp = BookProcesserFactory(create_dataset=True,
                           verbose=False,
                           use_citation_model=args.use_citation_model,
                           metadata_to_use=metadata_to_use)
book_processer = bkp.GetProcessFunction()

if args.use_goodreads:
    # if it doesn't find the file, run the script first without the --use_goodreads argument
    G = pickle.load(open("pickled_graphs/small_graph.p", "rb"))
else:
tr_inputs = torch.tensor(tr_inputs)
val_inputs = torch.tensor(val_inputs)
tr_tags = torch.tensor(tr_tags)
val_tags = torch.tensor(val_tags)
tr_masks = torch.tensor(tr_masks)
val_masks = torch.tensor(val_masks)

train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=bs)

valid_data = TensorDataset(val_inputs, val_masks, val_tags)
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=bs)

model = RobertaForTokenClassification.from_pretrained("roberta-base",
                                                      num_labels=len(tag2idx))
model.cuda()

FULL_FINETUNING = True
if FULL_FINETUNING:
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [{
        'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay_rate': 0.01
    }, {
        'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay_rate':
def train(no_cache: bool, dataset_path: str, data_config_name: str,
          training_args: TrainingArguments, tokenizer: RobertaTokenizerFast):
    print(f"tokenizer vocab size: {tokenizer.vocab_size}")
    print(f"\nLoading and tokenizing datasets found in {dataset_path}.")
    train_dataset, eval_dataset, test_dataset = load_dataset(
        'EMBO/sd-nlp',  # './tokcl/loader.py',
        data_config_name,
        script_version="main",
        # data_dir=dataset_path,
        split=["train", "validation", "test"],
        # download_mode=GenerateMode.FORCE_REDOWNLOAD if no_cache else GenerateMode.REUSE_DATASET_IF_EXISTS,
        cache_dir=CACHE)
    print(f"\nTraining with {len(train_dataset)} examples.")
    print(f"Evaluating on {len(eval_dataset)} examples.")
    if data_config_name in ["NER", "ROLES"]:
        # use our fancy data collator that randomly masks some of the inputs to enforce context learning
        training_args.remove_unused_columns = False  # we need tag_mask
        data_collator = DataCollatorForMaskedTokenClassification(
            tokenizer=tokenizer,
            max_length=config.max_length,
            masking_probability=training_args.masking_probability,
            replacement_probability=training_args.replacement_probability,
            select_labels=training_args.select_labels)
    else:
        # normal token classification
        data_collator = DataCollatorForTokenClassification(
            tokenizer=tokenizer, max_length=config.max_length)
    num_labels = train_dataset.info.features['labels'].feature.num_classes
    label_list = train_dataset.info.features['labels'].feature.names
    print(f"\nTraining on {num_labels} features:")
    print(", ".join(label_list))
    compute_metrics = MetricsComputer(label_list=label_list)
    model = RobertaForTokenClassification.from_pretrained(
        LM_MODEL_PATH,
        num_labels=num_labels,
        max_position_embeddings=config.max_length + 2)
    print("\nTraining arguments:")
    print(training_args)
    trainer = Trainer(model=model,
                      args=training_args,
                      data_collator=data_collator,
                      train_dataset=train_dataset,
                      eval_dataset=eval_dataset,
                      compute_metrics=compute_metrics,
                      callbacks=[ShowExample(tokenizer)])
    print(f"CUDA available: {torch.cuda.is_available()}")
    trainer.train()
    trainer.save_model(training_args.output_dir)
    print(f"Testing on {len(test_dataset)}.")
    pred: NamedTuple = trainer.predict(test_dataset, metric_key_prefix='test')
    print(f"{pred.metrics}")
bert_out_address = 'models/'
if not os.path.exists(bert_out_address):
    os.makedirs(bert_out_address)

# Save a trained model, configuration and tokenizer
model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
# If we save using the predefined names, we can load using `from_pretrained`
output_model_file = os.path.join(bert_out_address, "pytorch_model.bin")
output_config_file = os.path.join(bert_out_address, "config.json")
# Save model into file
torch.save(model_to_save.state_dict(), output_model_file)
model_to_save.config.to_json_file(output_config_file)
tokenizer.save_vocabulary(bert_out_address)

# %%
model = RobertaForTokenClassification.from_pretrained(bert_out_address,
                                                      num_labels=len(tag2idx))
# Set model to GPU
model.cuda()
if n_gpu > 1:
    model = torch.nn.DataParallel(model)

# Evaluation loop
model.eval()
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0
y_true = []
y_pred = []

print("***** Running evaluation *****")
print("  Num examples = {}".format(len(val_inputs)))
print("  Batch size = {}".format(batch_num))
for step, batch in enumerate(valid_dataloader):
from argparse import ArgumentParser

from transformers import pipeline, RobertaForTokenClassification

from common import TOKCL_MODEL_PATH
from common.config import config

_EXAMPLE = """<s> F. Western blot of input and eluates of Upf1 domains purification in a Nmd4-HA strain. The band with the # might corresponds to a dimer of Upf1-CH, bands marked with a star correspond to residual signal with the anti-HA antibodies (Nmd4). Fragments in the eluate have a smaller size because the protein A part of the tag was removed by digestion with the TEV protease. G6PDH served as a loading control in the input samples </s>"""

if __name__ == "__main__":
    parser = ArgumentParser(description="Quick try of a NER model")
    parser.add_argument("text", nargs="?", default=_EXAMPLE, help="Text to analyze.")
    parser.add_argument("-M", "--model-path", default=TOKCL_MODEL_PATH, help="Path to the model.")
    args = parser.parse_args()
    text = args.text
    model_path = args.model_path
    model = RobertaForTokenClassification.from_pretrained(model_path)
    tokenizer = config.tokenizer
    pipe = pipeline('ner', model, tokenizer=tokenizer)
    res = pipe(text)
    for r in res:
        print(r['word'], r['entity'])
from transformers import BertTokenizer, RobertaForTokenClassification, DataProcessor

if torch.cuda.is_available():
    # Tell PyTorch to use the GPU.
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

tokenizer = BertTokenizer.from_pretrained("RoBERTa_zh_Large_PyTorch")
bert_model = RobertaForTokenClassification.from_pretrained(
    "./RoBERTa_zh_Large_PyTorch/",  # use the 12-layer BERT model
    num_labels=5000,                # number of output labels for the multi-class task, i.e. len(tag2idx)
    output_attentions=False,        # do not return attention weights
    output_hidden_states=False,     # do not return all hidden states
)


class enity_identifing(nn.Module):
    def __init__(self, vocab_size, embedding_dim, bert_model):
        super(enity_identifing, self).__init__()
        self.bert_model = bert_model.to(device)
        self.embed = nn.Embedding(vocab_size, embedding_dim)
        initrange = 0.1
        self.embed.weight.data.uniform_(-initrange, initrange)
        self.rnn_type = "LSTM"
        self.nhid = 512
        self.rnn = nn.LSTM(5000, self.nhid, bidirectional=True, dropout=0.5).to(device)
def sentenceLabel(sentence):
    f = open('./model_save/tag2idx.pckl', 'rb')
    tag2idx = pickle.load(f)
    device = torch.device("cpu")
    output_dir = './model_save/'
    idx2tag = dict((v, k) for k, v in tag2idx.items())
    tokenizer = RobertaTokenizer.from_pretrained(output_dir)
    model = RobertaForTokenClassification.from_pretrained(output_dir)
    model.aux_logits = False
    model.to(device)

    tokenized_sentence = tokenizer.encode(sentence)
    input_ids = torch.tensor([tokenized_sentence]).to(device)

    all_tokens = []
    origin_tokens = sentence.split(' ')
    print(origin_tokens)
    all_entities = []
    entity_types = []
    tokenized_sentence = tokenizer.encode(sentence)
    input_ids = torch.tensor([tokenized_sentence]).to(device)
    predictions = []
    with torch.no_grad():
        output = model(input_ids)
    output = output[0].detach().cpu().numpy()
    predictions.extend([list(p) for p in np.argmax(output, axis=2)])
    tags_predictions = []
    for x in predictions[0]:
        tags_predictions.append(idx2tag[int(x)])

    # get tokens from ids
    tokens = []
    count = 0
    for x in tokenizer.convert_ids_to_tokens(tokenized_sentence):
        if count == 1:
            tokens.append(x)
        elif x[0] == 'Ġ':
            tokens.append(x[1:])
        else:
            tokens.append(x)
        count += 1

    wordIndex = 0
    startIndex = 0
    entityIndex = 0
    entity_types.append(tags_predictions[1:-1])
    for x in tokens[1:-1]:
        entity = entity_types[0][entityIndex]
        entityIndex += 1
        if wordIndex == len(origin_tokens):
            break
        if x in origin_tokens[wordIndex].lower():
            if startIndex == 0:
                all_tokens.append(origin_tokens[wordIndex])
                if len(entity) < 2:
                    all_entities.append(entity)
                else:
                    all_entities.append(entity[2:])
            startIndex = startIndex + len(x)
            if startIndex >= len(origin_tokens[wordIndex]):
                wordIndex += 1
                startIndex = 0
    print(all_tokens)
    print(all_entities)
    return all_tokens, all_entities
parser.add_argument("--bsz", type=int) parser.add_argument("--path-to-parallel-data-json") parser.add_argument("--output-path") parser.add_argument("--update-freq", type=int, default=1) args = parser.parse_args() with open(args.path_to_parallel_data_json, "r") as f: text = [i.strip() for i in f.readlines()] tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base") dataset = NERDataset( text, None, tokenizer, label_mapping=["KEEP", "MASK", "DELETE"] ) config = RobertaConfig.from_pretrained("roberta-base") config.num_labels = 3 model = RobertaForTokenClassification.from_pretrained( args.hf_dump, config=config ).cuda() print("Loaded roberta model") output_masks = open(os.path.join(args.output_path, "masks.txt"), "w") list_of_edits = [] total_out_list = [] for v, example in enumerate(DataLoader(dataset, batch_size=args.bsz)): output = model(**example) out_list = [] for i, x in enumerate(output.logits.argmax(-1)): current_sent = [] current_edits = [] for j, tok in enumerate(tokenizer.tokenize(text[args.bsz * v + i])): val = x[j + 1]
args = parser.parse_args()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device {}.".format(device))

torch.manual_seed(args.seed)

# ------ LOAD DATA ------
train_loader = _get_data_loader(args.batch_size, args.data_dir,
                                "train_roberta.csv", args.max_len)
test_loader = _get_data_loader(args.batch_size, args.val_dir,
                               "test_roberta.csv", args.max_len)

# ------ CREATE ROBERTA MODEL ------
model = RobertaForTokenClassification.from_pretrained(
    'roberta-base', num_labels=args.n_tags).to(device)

# ------ SPECIFY OPTIMIZER ------
no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [{
    "params": [
        p for n, p in model.named_parameters()
        if not any(nd in n for nd in no_decay)
    ],
    "weight_decay": 0.01
}, {
    "params": [
        p for n, p in model.named_parameters()
        if any(nd in n for nd in no_decay)
    ],
label_list = processor.get_labels(data)
num_labels = len(label_list)
if task in ['pos-tagging', 'ner']:
    num_labels += 1  # we add 1 because of the padding, which is labelled 0
logging.info("\tDone.")
logging.info(
    "Fetching pre-trained RoBERTa model: {} and Tokenizer: {} for the task: {}..."
    .format(parameters['pretrained_model'],
            parameters['pretrained_tokenizer'],
            parameters['task']))
if task in ['pos-tagging', 'ner']:
    model = RobertaForTokenClassification.from_pretrained(
        parameters['pretrained_model'],
        num_labels=num_labels,                                     # The number of output labels for classification.
        output_attentions=parameters['output_attentions'],        # Whether the model returns attention weights.
        output_hidden_states=parameters['output_hidden_states'],  # Whether the model returns all hidden states.
    )
elif task in ['sentence-classification']:
    model = RobertaForSequenceClassification.from_pretrained(
        parameters['pretrained_model'],
        num_labels=num_labels,                                     # The number of output labels for classification.
        output_attentions=parameters['output_attentions'],        # Whether the model returns attention weights.
        output_hidden_states=parameters['output_hidden_states'],  # Whether the model returns all hidden states.
    )
tokenizer = RobertaTokenizer.from_pretrained(
accumulation_steps = args.update_freq

with open(args.path_to_parallel_data_json, "r") as f:
    data = [json.loads(i.strip()) for i in f.readlines()]
text = [i["source"] for i in data]
labels = [i["label"] for i in data]

tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")
dataset = NERDataset(text,
                     labels,
                     tokenizer,
                     label_mapping=["KEEP", "MASK", "DELETE"])

model = RobertaForTokenClassification.from_pretrained(
    f"{args.hf_dump}/pytorch_model.bin",
    config=f"{args.hf_dump}/config.json",
    num_labels=len(dataset.label_mapping),
    hidden_dropout_prob=0.2,
    attention_probs_dropout_prob=0.2,
).cuda()

optimizer = optim.Adam(model.parameters(), lr=3e-6)
scheduler = transformers.get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=5000,
    num_training_steps=40000,
)

os.makedirs(args.save_dir, exist_ok=False)

for _ in range(args.epochs):
    for i, example in tqdm(
            enumerate(
from transformers import RobertaTokenizer, RobertaForTokenClassification
import torch

tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaForTokenClassification.from_pretrained('roberta-base')

s = 'Hello, my dog is cute'
encoded_tokens = tokenizer.encode(s, add_special_tokens=True)
input_ids = torch.tensor(encoded_tokens).unsqueeze(0)  # Batch size 1
labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0)  # Batch size 1
outputs = model(input_ids, labels=labels)
loss, scores = outputs[:2]
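# A minimal follow-up sketch (not part of the original example) showing how the per-token
# scores from the snippet above can be turned into predicted label ids and aligned with the
# tokenizer's tokens. Note that 'roberta-base' has a randomly initialised classification head,
# so the predicted ids are meaningless until the model has been fine-tuned on labelled data.
import torch

with torch.no_grad():
    outputs = model(input_ids)            # no labels passed, so the first element is the logits
pred_ids = outputs[0].argmax(dim=-1).squeeze(0).tolist()   # one predicted label id per token
tokens = tokenizer.convert_ids_to_tokens(encoded_tokens)
for token, label_id in zip(tokens, pred_ids):
    print(token, label_id)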
        entities["special_tokens_mask"] = special_tokens_mask  # restore special_tokens_mask for potential carry over to next serial model
        output.append(entities)
    return output


if __name__ == "__main__":
    parser = ArgumentParser(description="SmartTagging of free text.")
    parser.add_argument(
        "text",
        nargs="?",
        default="We studied mice with genetic ablation of the ERK1 gene in brain and muscle.",
        help="The text to tag.")
    args = parser.parse_args()
    text = args.text
    panel_model = RobertaForTokenClassification.from_pretrained(f"EMBO/sd-panels")
    ner_model = RobertaForTokenClassification.from_pretrained(f"EMBO/sd-ner")
    role_model = RobertaForTokenClassification.from_pretrained(f"EMBO/sd-roles")
    tokenizer = RobertaTokenizerFast.from_pretrained(f"roberta-base")
    tagger = Tagger(
        tokenizer,
        panel_model,  # segments figure legends into panel legends
        ner_model,    # tags biological entities
        role_model    # semantic roles of entities
    )
    tagged = tagger(text)
    print(tagged)