def __init__(self, cache_dir=DEFAULT_CACHE_DIR, verbose=False):
    from transformers import AutoModelForTokenClassification, AutoTokenizer

    # download the model or load the model path
    weights_path = download_model('bert.ner', cache_dir,
                                  process_func=_unzip_process_func,
                                  verbose=verbose)

    self.label_list = ["O", "B-MISC", "I-MISC", "B-PER", "I-PER",
                       "B-ORG", "I-ORG", "B-LOC", "I-LOC"]
    self.model = AutoModelForTokenClassification.from_pretrained(weights_path)
    self.tokenizer = AutoTokenizer.from_pretrained(weights_path)
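# Example usage (a minimal sketch, not from the snippet itself): the class
# name `BertNer` and the Danish test sentence are assumptions.
import torch

tagger = BertNer(verbose=True)  # hypothetical name of the class defined above
inputs = tagger.tokenizer("Jens Hansen bor i Aarhus", return_tensors="pt")
with torch.no_grad():
    logits = tagger.model(**inputs).logits
print([tagger.label_list[i] for i in logits.argmax(dim=-1)[0].tolist()])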
def pack_ner():
    svc = NERService()
    model_name = "dbmdz/bert-large-cased-finetuned-conll03-english"
    tokenizer_name = "bert-base-cased"
    model = AutoModelForTokenClassification.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    artifact = {"model": model, "tokenizer": tokenizer}
    svc.pack("model", artifact)
    print(f"NER service packed: {svc.save()}")
def __init__(self, model_type: str = "BERT", model_name: str = "dslim/bert-base-NER", load_path: str = ""): self.adaptor = get_adaptor(model_type) if load_path != "": model = AutoModelForTokenClassification.from_pretrained(load_path) else: model = AutoModelForTokenClassification.from_pretrained(model_name) super().__init__(model_type, model_name, model) device_number = detect_cuda_device_number() self._pipeline = TokenClassificationPipeline(model=self.model, tokenizer=self.tokenizer, device=device_number) self._trainer = TOCTrainer(self.model, model_type, self.tokenizer, self._device, self.logger)
def _build_model(self):
    config = AutoConfig.from_pretrained(
        self._model_name,
        num_labels=len(self._labels),
        id2label=self._label_map,
        label2id={label: i for i, label in enumerate(self._labels)},
    )
    model = AutoModelForTokenClassification.from_pretrained(
        self._model_name, config=config)
    self._adapter_internal_name = model.load_adapter(
        self._adapter_name, "text_task")
    return model
def evaluate(
    _log,
    _run,
    temperature=1.0,
    artifacts_dir="artifacts",
    load_params="model.pth",
    device="cpu",
    save_confusion_matrix=False,
):
    """Evaluate a trained target model."""
    model_name = "clulab/roberta-timex-semeval"
    _log.info("Loading %s", model_name)
    config = AutoConfig.from_pretrained(model_name)
    token_clf = AutoModelForTokenClassification.from_pretrained(model_name, config=config)
    model = RoBERTagger(token_clf, config.num_labels, temperature)

    artifacts_dir = Path(artifacts_dir)
    _log.info("Loading model parameters from %s", artifacts_dir / load_params)
    model.load_state_dict(torch.load(artifacts_dir / load_params, "cpu"))
    model.to(device)

    _log.info("Evaluating")
    eval_score, _ = run_eval(model, config.id2label, read_samples_(),
                             confusion=save_confusion_matrix)
    c = eval_score.pop("confusion", None)
    print_accs(eval_score, on="test", run=_run)

    if c is not None:
        # build a sorted label list with "O" first
        labels = set()
        for k in c.keys():
            labels.update(k)
        if "O" in labels:
            labels.remove("O")
        labels = sorted(labels)
        labels.insert(0, "O")
        label2id = {l: i for i, l in enumerate(labels)}

        # densify the (gold, predicted) counts into a confusion matrix
        m = np.zeros((len(labels), len(labels)))
        for k, cnt in c.items():
            m[label2id[k[0]], label2id[k[1]]] = cnt

        _log.info("Saving labels list in %s", artifacts_dir / "labels.pkl")
        with open(artifacts_dir / "labels.pkl", "wb") as f:
            pickle.dump(labels, f)
        _log.info("Saving confusion matrix in %s", artifacts_dir / "confusion.npy")
        np.save(artifacts_dir / "confusion.npy", m)

    return eval_score["f1"]
def load_model(pred_config, args, device):
    # Check whether the trained model exists
    if not os.path.exists(pred_config.model_dir):
        raise Exception("Model doesn't exist! Train first!")

    try:
        # Config is loaded automatically from model_dir
        model = AutoModelForTokenClassification.from_pretrained(args.model_dir)
        model.to(device)
        model.eval()
        logger.info("***** Model Loaded *****")
    except Exception:
        raise Exception("Some model files might be missing...")

    return model
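# Example call (sketch): `pred_config` and `args` are stand-ins carrying only
# the attributes load_model reads; "./model" is an illustrative directory.
from types import SimpleNamespace
import torch

pred_config = SimpleNamespace(model_dir="./model")
args = SimpleNamespace(model_dir="./model")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = load_model(pred_config, args, device)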
def __init__(self):
    self.tokenizer = AutoTokenizer.from_pretrained(
        "Alaeddin/convbert-base-turkish-ner-cased")
    self.model = AutoModelForTokenClassification.from_pretrained(
        "Alaeddin/convbert-base-turkish-ner-cased")
    self.config = PretrainedConfig.from_pretrained(
        "Alaeddin/convbert-base-turkish-ner-cased")
    self.pipeline = pipeline('ner', model=self.model,
                             tokenizer=self.tokenizer, config=self.config)
    self.nlp = spacy.load("en_core_web_sm")
    self.nlp_grouped = TokenClassificationPipeline(
        model=self.model,
        tokenizer=self.tokenizer,
        grouped_entities=True)
def load(cls, model_name_or_path: str) -> AdaptiveModel:
    """Class method for loading and constructing this tagger

    * **model_name_or_path** - A key string of one of Transformers' pre-trained Token Tagger models, or a `HFModelResult`

    Note: To search for valid models, you should use the AdaptNLP `model_hub` API
    """
    if isinstance(model_name_or_path, HFModelResult):
        model_name_or_path = model_name_or_path.name
    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
    model = AutoModelForTokenClassification.from_pretrained(model_name_or_path)
    tagger = cls(tokenizer, model)
    return tagger
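# Example call (sketch): `TransformersTokenTagger` is a placeholder for
# whichever class defines the classmethod above.
tagger = TransformersTokenTagger.load("dslim/bert-base-NER")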
def __init__(self):
    self.gen_model = AutoModelForCausalLM.from_pretrained(
        'model/transformers/gen_model')
    self.gen_tokenizer = AutoTokenizer.from_pretrained(
        'model/transformers/gen_tokenizer')  # Add specific options if needed
    self.chat_tokenizer = AutoTokenizer.from_pretrained(
        "model/transformers/chat_tokenizer")
    self.chat_model = AutoModelForCausalLM.from_pretrained(
        "model/transformers/chat_model")
    self.ner_model = AutoModelForTokenClassification.from_pretrained(
        "model/transformers/ner_model")
    self.ner_tokenizer = AutoTokenizer.from_pretrained(
        "model/transformers/ner_tokenizer")
def load_model(self):
    tokenizer = AutoTokenizer.from_pretrained(self.model_path)
    model = AutoModelForTokenClassification.from_pretrained(self.model_path)
    model.to(self.device)

    # read one label per line; the line index becomes the label id
    labels_dict = {}
    with open(self.labels_file, 'r') as f:
        for i, line in enumerate(f):
            labels_dict[i] = line.replace("\n", '')
    self.labels_dict = labels_dict

    return model, tokenizer
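# The labels file read above is assumed to be plain text with one label per
# line, e.g. (illustrative contents, not taken from the snippet):
#
#   O
#   B-PER
#   I-PER
#
# which yields labels_dict == {0: 'O', 1: 'B-PER', 2: 'I-PER'}.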
def transformers_model_downloader(app):
    model_file = os.path.join(app, app + ".pt")
    if os.path.isfile(model_file):
        print("model already downloaded: " + model_file)
        return

    print("Download model for: ", model_file)
    if app == "text_classification":
        model_name = "bert-base-uncased"
        max_length = 150
        model = AutoModelForSequenceClassification.from_pretrained(
            model_name, torchscript=True, num_labels=2)
        tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=True)
    elif app == "question_answering":
        model_name = "distilbert-base-uncased-distilled-squad"
        max_length = 128
        model = AutoModelForQuestionAnswering.from_pretrained(
            model_name, torchscript=True)
        tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=True)
    elif app == "token_classification":
        model_name = "bert-base-uncased"
        max_length = 150
        model = AutoModelForTokenClassification.from_pretrained(
            model_name, torchscript=True, num_labels=9)
        tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=True)
    else:
        print("Unknown application: " + app)
        return

    # trace the model with a dummy input for AWS Neuron compilation
    text = "How is the weather"
    paraphrase = tokenizer.encode_plus(text,
                                       max_length=max_length,
                                       truncation=True,
                                       padding='max_length',
                                       add_special_tokens=True,
                                       return_tensors='pt')
    example_inputs = paraphrase['input_ids'], paraphrase['attention_mask']
    traced_model = torch.neuron.trace(model, example_inputs)

    # Export to saved model
    os.makedirs(app, exist_ok=True)
    traced_model.save(model_file)
    tokenizer.save_pretrained(app)
    logging.info("Compiled model %s successfully.", app)
def initialize(self, ctx):
    self.manifest = ctx.manifest
    self.metrics = ctx.metrics
    logger.info(f"Manifest: {self.manifest}")
    properties = ctx.system_properties
    self._batch_size = properties["batch_size"]
    logger.info(f"properties: {properties}")
    model_dir = properties.get("model_dir")
    self.device = torch.device("cuda:" + str(properties.get("gpu_id"))
                               if torch.cuda.is_available() else "cpu")

    labels = get_labels(os.path.join(model_dir, "labels.txt"))
    label_map = {i: label for i, label in enumerate(labels)}
    num_labels = len(labels)
    config = AutoConfig.from_pretrained(
        os.path.join(model_dir, "config.json"),
        num_labels=num_labels,
        id2label=label_map,
        label2id={label: i for i, label in enumerate(labels)},
    )

    # Read model serialize/pt file
    self.model = AutoModelForTokenClassification.from_pretrained(
        model_dir, config=config)
    self.tokenizer = AutoTokenizer.from_pretrained(model_dir, use_fast=True)
    self.nlp = pipeline(
        "ner",
        model=self.model,
        tokenizer=self.tokenizer,
        ignore_labels=[],
        grouped_entities=True,
        # ignore_subwords=True,
        device=self.device.index,
    )
    logger.debug(
        "Transformer model from path {0} loaded successfully".format(model_dir))

    self.initialized = True
def __init__(self, cfg: DictConfig, trainer: Trainer = None):
    self._tokenizer = AutoTokenizer.from_pretrained(cfg.tokenizer, add_prefix_space=True)
    super().__init__(cfg=cfg, trainer=trainer)

    self.num_labels = len(constants.ALL_TAG_LABELS)
    self.model = AutoModelForTokenClassification.from_pretrained(
        cfg.transformer, num_labels=self.num_labels)

    # Loss Functions
    self.loss_fct = nn.CrossEntropyLoss(ignore_index=constants.LABEL_PAD_TOKEN_ID)

    # setup to track metrics
    self.classification_report = ClassificationReport(
        self.num_labels, mode='micro', dist_sync_on_step=True)
def __init__(self, model_name="bert-base-cased", num_labels=2, cache_dir="."): config = AutoConfig.from_pretrained(model_name, num_labels=num_labels, cache_dir=cache_dir) model = AutoModelForTokenClassification.from_pretrained( model_name, cache_dir=cache_dir, config=config, output_loading_info=False) super().__init__(model_name=model_name, model=model, cache_dir=cache_dir)
def __init__(self, model_name="bert-base-cased-finetuned-mrpc", task="SequenceClassification"): self.tokenizer = AutoTokenizer.from_pretrained(model_name) if task == "SC": self.model = AutoModelForSequenceClassification.from_pretrained( model_name) elif task == "QA": self.model = AutoModelForQuestionAnswering.from_pretrained( model_name) elif task == "LM": self.model = AutoModelWithLMHead.from_pretrained(model_name) elif task == "TC": self.model = AutoModelForTokenClassification.from_pretrained( "dbmdz/bert-large-cased-finetuned-conll03-english")
def __init__(self, **data: Any):
    super().__init__(**data)
    model = AutoModelForTokenClassification.from_pretrained(self.model_name_or_path)
    if self.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_name, use_fast=True)
    else:
        tokenizer = None
    self._pipeline = pipeline(
        'ner',
        model=model,
        tokenizer=tokenizer,
        grouped_entities=self.grouped_entities,
    )
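# Example usage (sketch): the class name `NERTransformer` and its field names
# are guesses inferred from the attributes the __init__ above reads
# (model_name_or_path, tokenizer_name, grouped_entities); accessing the
# private _pipeline is only for illustration.
ner = NERTransformer(
    model_name_or_path="dslim/bert-base-NER",
    tokenizer_name="dslim/bert-base-NER",
    grouped_entities=True,
)
print(ner._pipeline("Hugging Face is based in New York City."))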
def __init__(
    self,
    checkpoint_directory: str,
    batch_size: int = 16,
    max_seq_length: int = None,
):
    """
    Args:
        checkpoint_directory: path
        batch_size: used in dataloader
    """
    # 0. device
    self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    # 1. max_seq_length
    if max_seq_length is not None:
        self.max_seq_length = max_seq_length
    else:
        path_max_seq_length = join(checkpoint_directory, "max_seq_length.json")
        with open(path_max_seq_length, "r") as f:
            self.max_seq_length = json.load(f)

    # 2. annotation
    path_annotation_classes = join(checkpoint_directory, "annotation_classes.json")
    with open(path_annotation_classes, "r") as f:
        self.annotation_classes = json.load(f)

    id2label = {i: label for i, label in enumerate(self.annotation_classes)}
    label2id = {label: i for i, label in id2label.items()}
    self.annotation_scheme = derive_annotation_scheme(id2label)

    # 3. model
    self.model = AutoModelForTokenClassification.from_pretrained(
        checkpoint_directory,
        id2label=id2label,
        label2id=label2id,
        return_dict=False,
    )
    self.model.eval()
    self.model = self.model.to(self.device)

    # 4. tokenizer
    self.tokenizer = AutoTokenizer.from_pretrained(checkpoint_directory)

    # 5. batch_size (dataloader)
    self.batch_size = batch_size
def create_base_model(exp_config):
    labels = exp_config["labels"]
    config = AutoConfig.from_pretrained(
        exp_config["model_name"],
        num_labels=len(labels),
        id2label={str(i): label for i, label in enumerate(labels)},
        label2id={label: i for i, label in enumerate(labels)},
    )
    model = AutoModelForTokenClassification.from_pretrained(
        exp_config["model_name"], config=config)
    model.to(exp_config["device"])
    return model
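# Example config (sketch): the keys match exactly what create_base_model
# reads; the values are illustrative.
exp_config = {
    "model_name": "bert-base-cased",
    "labels": ["O", "B-PER", "I-PER", "B-ORG", "I-ORG"],
    "device": "cpu",
}
model = create_base_model(exp_config)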
def load(self, train_output):
    pretrained_model = train_output['model_path']
    self._model_type = train_output['model_type']
    _, model_class, tokenizer_class = MODEL_CLASSES[train_output['model_type']]
    self._tokenizer = AutoTokenizer.from_pretrained(pretrained_model)
    self._model = AutoModelForTokenClassification.from_pretrained(pretrained_model)
    self._batch_size = train_output['batch_size']
    self._pad_token = self._tokenizer.convert_tokens_to_ids(
        [self._tokenizer.pad_token])[0]
    self._pad_token_label_id = train_output['pad_token_label_id']
    self._label_map = train_output['label_map']
    self._mask_padding_with_zero = True
    self._dataset_params_dict = train_output['dataset_params_dict']
    self._batch_padding = SpanLabeledTextDataset.get_padding_function(
        self._model_type, self._tokenizer, self._pad_token_label_id)
def create_pipeline(model_name):
    config = AutoConfig.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(
        model_name, use_fast=True, return_offsets_mapping=True)
    model = AutoModelForTokenClassification.from_pretrained(
        model_name, config=config)
    NER_pipeline = TokenClassificationPipeline(model=model,
                                               tokenizer=tokenizer,
                                               framework='pt',
                                               task='ner',
                                               grouped_entities=True)
    return NER_pipeline
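# Example usage (sketch): with grouped_entities=True the pipeline returns
# one dict per entity span rather than per subword token; the model id and
# the sentence below are illustrative.
ner = create_pipeline("dslim/bert-base-NER")
print(ner("My name is Wolfgang and I live in Berlin."))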
def on_epoch_end(self, args: TrainingArguments, state: TrainerState,
                 control: TrainerControl, **kwargs):
    model_path = args.output_dir + "/checkpoint-" + str(956 * int(state.epoch))
    global label_map
    label_map = {i: label for i, label in enumerate(self.labels)}
    num_labels = len(self.labels)
    config = AutoConfig.from_pretrained(
        model_path,
        num_labels=num_labels,
        id2label=label_map,
        label2id={label: i for i, label in enumerate(self.labels)},
        cache_dir=None,
    )
    model = AutoModelForTokenClassification.from_pretrained(
        model_path,
        from_tf=False,
        config=config,
        cache_dir=None,
    )
    trainer = Trainer(model=model)

    # evaluate the student on the three teacher sets and append the
    # precision/recall/f1 triples to results.csv
    m1, m2, m3 = student_performance(trainer, self.teacher_sets)
    results = ", ".join(
        str(m[key]) for m in (m1, m2, m3) for key in ("precision", "recall", "f1")
    ) + "\n"
    with open(args.output_dir + "/results.csv", "a") as f:
        f.write(results)
    print(results)

    # keep the checkpoint weights only if the mean f1 of sets 1 and 3 improves
    if (m1['f1'] + m3['f1']) / 2 <= self.best_f1:
        for name in ("pytorch_model.bin", "optimizer.pt"):
            delete_filename = model_path + "/" + name
            open(delete_filename, 'w').close()  # truncate before removal
            os.remove(delete_filename)
        print("deleted")
    else:
        self.best_f1 = (m1['f1'] + m3['f1']) / 2
def keyword_extractor(input_sentence, pos_list=['0', '1', '2']):
    # '<SOS>'/'<EOS>' stand in for the [CLS]/[SEP] positions so that word
    # indices line up with the model's token indices (this assumes the
    # wordpiece tokenization does not split any word).
    input_words = ['<SOS>'] + nltk.word_tokenize(input_sentence) + ['<EOS>']
    tokenizer = AutoTokenizer.from_pretrained(
        "vblagoje/bert-english-uncased-finetuned-pos")
    model = AutoModelForTokenClassification.from_pretrained(
        "vblagoje/bert-english-uncased-finetuned-pos")
    inputs = tokenizer(input_sentence)
    output = model(torch.tensor(inputs['input_ids']).unsqueeze(0))
    output_label = torch.argmax(output.logits.data.squeeze(0), dim=1)

    # what we need are ADJ and ADV: keep words whose predicted POS label id
    # is in pos_list
    keywords = []
    for i, label in enumerate(output_label.tolist()):
        if str(label) in pos_list:
            keywords.append(input_words[i])
    return keywords
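# Example call (sketch): the id strings in pos_list map to POS tags through
# the model's id2label; the default '0'-'2' above (and the sentence here)
# are illustrative, not read from the model's actual config.
print(keyword_extractor("The extremely quick fox jumped gracefully."))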
def __init__(self,
             output_dir,
             labels: List[str],
             ignore_sub_tokens_labes: bool,
             spliting_strategy: Optional[str],
             sentence_strategy: Optional[str],
             prediction_strategy: Optional[str],
             model_name_or_path=None,
             loaded_model=None):
    self.output_dir = output_dir
    self.labels = labels
    self.num_labels = len(self.labels)
    self.label_map: Dict[int, str] = {
        i: label for i, label in enumerate(self.labels)
    }
    self.label2id = {label: i for i, label in enumerate(self.labels)}
    self.ignore_sub_tokens_labes = ignore_sub_tokens_labes
    self.spliting_strategy = spliting_strategy
    self.sentence_strategy = sentence_strategy
    self.prediction_strategy = prediction_strategy

    if loaded_model is not None:
        self.config, self.tokenizer, self.model = loaded_model
    else:
        tokenizer_name = model_name_or_path
        config_name = model_name_or_path
        self.config = AutoConfig.from_pretrained(
            config_name,
            num_labels=self.num_labels,
            id2label=self.label_map,
            label2id=self.label2id,
        )
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
        self.model = AutoModelForTokenClassification.from_pretrained(
            model_name_or_path,
            from_tf=bool(".ckpt" in model_name_or_path),
            config=self.config,
        )

    self.max_seq_length = 128
def main(): config = AutoConfig.from_pretrained("bert-base-cased", num_labels=93) tokenizer = AutoTokenizer.from_pretrained("bert-base-cased") model = AutoModelForTokenClassification.from_pretrained("bert-base-cased", from_tf=bool(".ckpt" in "bert-base-cased"), config=config) criterion = nn.CrossEntropyLoss() batch_size = 1 # if torch.cuda.is_available(): # model.load_state_dict( # torch.load('bert-base-cased') # ) # else: # model.load_state_dict( # torch.load('bert-base-cased', map_location=torch.device('cpu')) # ) # with open('../label_encoder.sklrn', 'rb') as f: # le = pickle.load(f) test_example = [ ["Interpretation of HuggingFase's model decision"], ["Transformer-based models have taken a leading role " "in NLP today."] ] test_dataset = NewsDataset( data_list=test_example, tokenizer=tokenizer, max_length=config.max_position_embeddings, ) test_dataloader = DataLoader( test_dataset, batch_size=batch_size, shuffle=False, ) integrated_grad = IntegratedGradient( model, criterion, tokenizer, show_progress=True, encoder="bert" ) instances = integrated_grad.saliency_interpret(test_dataloader)
def __init__(self, hparams, tag2idx):
    super().__init__()
    self.embedding_dim = hparams['embedding_dim']
    self.batch_size = hparams['batch_size']
    self.seq_length = hparams['seq_length']
    self.device = hparams['device']
    self.tag2idx = tag2idx
    self.tagset_size = len(tag2idx)

    self.bert = AutoModelForTokenClassification.from_pretrained(
        hparams['bert'], output_hidden_states=True)
    self.hidden2tag = nn.Linear(self.embedding_dim, self.tagset_size)
    self.Softmax = nn.Softmax(dim=-1)
def __init__(self, model_name="dumitrescustefan/bert-base-romanian-cased-v1", tokenizer_name=None, lr=2e-05, model_max_length=512, bio2tag_list=[], tag_list=[]): super().__init__() if tokenizer_name is None or tokenizer_name == "": tokenizer_name = model_name print("Loading AutoModel [{}] ...".format(model_name)) self.model_name = model_name self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, strip_accents=False) self.model = AutoModelForTokenClassification.from_pretrained( model_name, num_labels=len(bio2tag_list), from_flax=False) self.dropout = nn.Dropout(0.2) self.lr = lr self.model_max_length = model_max_length self.bio2tag_list = bio2tag_list self.tag_list = tag_list self.num_labels = len(bio2tag_list) self.train_loss = [] self.valid_y_hat = [] self.valid_y = [] self.valid_loss = [] self.test_y_hat = [] self.test_y = [] self.test_loss = [] # check cls, sep and pad tokens if self.tokenizer.cls_token_id is None: print( f"*** Warning, tokenizer {tokenizer_name} has no defined CLS token: sequences will not be marked with special chars! ***" ) if self.tokenizer.sep_token_id is None: print( f"*** Warning, tokenizer {tokenizer_name} has no defined SEP token: sequences will not be marked with special chars! ***" ) # add pad token self.validate_pad_token()
def load(output_dir):
    config = AutoConfig.from_pretrained(output_dir)
    tokenizer = AutoTokenizer.from_pretrained(output_dir)
    model = AutoModelForTokenClassification.from_pretrained(output_dir)

    with open(f'{output_dir}/settings.json', 'r') as infile:
        data = json.load(infile)
    ignore_sub_tokens_labes = data.get("ignore_sub_tokens_labes", False)
    spliting_strategy = data.get("spliting_strategy", None)
    sentence_strategy = data.get("sentence_strategy", None)
    prediction_strategy = data.get("prediction_strategy", None)

    return TokenClassifier(output_dir=output_dir,
                           labels=data["labels"],
                           loaded_model=(config, tokenizer, model),
                           sentence_strategy=sentence_strategy,
                           spliting_strategy=spliting_strategy,
                           prediction_strategy=prediction_strategy,
                           ignore_sub_tokens_labes=ignore_sub_tokens_labes)
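# Example call (sketch): reloading a classifier persisted to an output
# directory by the counterpart save routine; "./ner-output" is illustrative.
classifier = TokenClassifier.load("./ner-output")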
def _convert_to_transformers_ner(adaptive_model, prediction_head):
    # add more info to config
    adaptive_model.language_model.model.config.num_labels = prediction_head.num_labels
    adaptive_model.language_model.model.config.id2label = {
        id: label for id, label in enumerate(prediction_head.label_list)}
    adaptive_model.language_model.model.config.label2id = {
        label: id for id, label in enumerate(prediction_head.label_list)}
    adaptive_model.language_model.model.config.finetuning_task = "token_classification"
    adaptive_model.language_model.model.config.language = adaptive_model.language_model.language

    # init model
    transformers_model = AutoModelForTokenClassification.from_config(
        adaptive_model.language_model.model.config)
    # transfer weights for language model + prediction head
    setattr(transformers_model, transformers_model.base_model_prefix,
            adaptive_model.language_model.model)
    transformers_model.classifier.load_state_dict(
        prediction_head.feed_forward.feed_forward[0].state_dict())

    return transformers_model
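# Example follow-up (sketch): persisting the converted model in the standard
# transformers layout, assuming `adaptive_model` and `prediction_head` come
# from an already-trained FARM-style NER model.
transformers_model = _convert_to_transformers_ner(adaptive_model, prediction_head)
transformers_model.save_pretrained("converted-ner-model")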
def get_this_model(task, model_config):
    from transformers import AutoModelForSequenceClassification
    from transformers import AutoModelForSeq2SeqLM
    from transformers import AutoModelForMultipleChoice
    from transformers import AutoModelForTokenClassification

    if task in (SEQCLASSIFICATION, SEQREGRESSION):
        return AutoModelForSequenceClassification.from_pretrained(
            checkpoint_path, config=model_config)
    elif task == TOKENCLASSIFICATION:
        return AutoModelForTokenClassification.from_pretrained(
            checkpoint_path, config=model_config)
    elif task in NLG_TASKS:
        return AutoModelForSeq2SeqLM.from_pretrained(
            checkpoint_path, config=model_config)
    elif task == MULTICHOICECLASSIFICATION:
        return AutoModelForMultipleChoice.from_pretrained(
            checkpoint_path, config=model_config)
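# Example call (sketch): assumes the enclosing module defines
# `checkpoint_path` and the task constants used above; the model id and
# label count are illustrative.
from transformers import AutoConfig

model_config = AutoConfig.from_pretrained("bert-base-cased", num_labels=9)
ner_model = get_this_model(TOKENCLASSIFICATION, model_config)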
def _preparations_data_predict(self):
    """
    :created attr: annotation [Annotation]
    :created attr: model [transformers AutoModelForTokenClassification]
    :return: -
    """
    # annotation
    self.annotation = Annotation(json.loads(self.hparams.annotation_classes))

    # model
    self.model = AutoModelForTokenClassification.from_pretrained(
        self.pretrained_model_name,
        num_labels=len(self.annotation.classes),
        return_dict=False,
    )
    self.model.resize_token_embeddings(
        len(self.tokenizer))  # due to additional_special_tokens