def __init__(self, modelname="", dataset=None, use_saved_model=False):
    """Create the wrapped BERT ``NERModel``.

    Args:
        modelname: Suffix of the output directory (``outputs_<modelname>``).
        dataset: Mapping that provides a ``'labels_list'`` entry. May be
            ``None``; in that case no label list is forced onto the model.
        use_saved_model: When true, load the model from the output
            directory instead of starting from ``bert-base-cased``.
    """
    self.dataset = dataset
    output_dir = "outputs_{}".format(modelname)

    model_args = {
        'output_dir': output_dir,
        'overwrite_output_dir': True,
        'reprocess_input_data': True,
        'save_eval_checkpoints': False,
        'save_steps': -1,
        'save_model_every_epoch': False,
        'train_batch_size': 10,
        'num_train_epochs': 10,
        'max_seq_length': 256,
        'gradient_accumulation_steps': 8,
    }
    # ``dataset`` defaults to None, so subscripting it unconditionally made
    # the default call fail with a TypeError. Only pin the label set when a
    # dataset is actually supplied.
    # NOTE(review): without 'labels_list' NERModel presumably falls back to
    # the labels stored with a saved model or to its defaults - confirm.
    if dataset is not None:
        model_args['labels_list'] = dataset['labels_list']

    model_name = output_dir if use_saved_model else "bert-base-cased"
    self.model = NERModel("bert", model_name, use_cuda=False, args=model_args)
def predict(sentence):
    """Tag one sentence with the saved ``NERMODEL1`` model.

    Args:
        sentence: Text to tag.

    Returns:
        A JSON string with the predictions for the sentence, or ``None``
        when ``sentence`` is empty/falsy (nothing is loaded in that case).
    """
    # Explicit guard: an empty input must not fall through to a ``return``
    # of a name that was never assigned.
    if not sentence:
        return None

    # NOTE(review): "I-" and "funda" look like an accidentally split
    # "I-funda"; left untouched because the list has to match the labels
    # the saved model was trained with - confirm.
    labels = [
        "B-sector", "I-sector", "B-funda", "O", "operator", "threshold",
        "Join", "B-attr", "I-funda", "TPQty", "TPUnit", "Sortby", "B-eco",
        "I-eco", "B-index", "Capitalization", "I-", "funda", "B-security",
        'I-security', 'Number', 'Sector', 'TPMonth', 'TPYr', 'TPRef',
    ]
    model_args = {
        "save_eval_checkpoints": False,
        "save_steps": -1,
        "output_dir": "NERMODEL",
        'overwrite_output_dir': True,
        "save_model_every_epoch": False,
        'reprocess_input_data': True,
        "train_batch_size": 10,
        'num_train_epochs': 15,
        "max_seq_length": 64,
    }
    # The model is re-created on every call, exactly as before.
    model1 = NERModel('bert', 'NERMODEL1', labels=labels, args=model_args,
                      use_cuda=False)

    predictions, _ = model1.predict([sentence])
    return json.dumps(predictions[0])
def training():
    """Run a single W&B trial: train a RoBERTa NER model, then close the run."""
    wandb.init()

    # The trial's configuration is handed to the model via ``sweep_config``.
    ner_model = NERModel(
        "roberta",
        "roberta-base",
        use_cuda=True,
        args=model_args,
        sweep_config=wandb.config,
    )
    # Alternative backbone kept for reference:
    # NERModel("distilbert", "distilbert-base-cased", use_cuda=True,
    #          args=model_args, sweep_config=wandb.config)
    ner_model.train_model(train_df, eval_data=trial_df)

    wandb.join()
def roberta_ner_train(self):
    """Train a ``roberta-base`` NER model on ``self.roberta_ner_train_data``.

    The rows are loaded into a DataFrame with the columns ``sentence_id``,
    ``words`` and ``labels``; the trained model is kept on
    ``self.roberta_ner_model``.
    """
    column_names = ['sentence_id', 'words', 'labels']
    frame = pd.DataFrame(self.roberta_ner_train_data, columns=column_names)

    training_args = {
        "num_train_epochs": 3,
        "overwrite_output_dir": True,
        "output_dir": "ner_outputs/",
    }
    self.roberta_ner_model = NERModel(
        "roberta",
        "roberta-base",
        labels=self.labels,
        args=training_args,
    )
    self.roberta_ner_model.train_model(frame)
def __init__(self, ner_path: str, deps_path: str):
    """Create/Load a new DependencyModel

    Args:
        ner_path (str): directory of NER model. (if not exists, create a new model)
        deps_path (str): directory of classification model.
    """
    self.ner_path = ner_path
    try:
        self.ner_model = NERModel("distilbert", ner_path, use_cuda=False)
    except Exception:
        # Loading from ``ner_path`` failed, so start from the pretrained
        # base model. ``Exception`` (not a bare ``except``) keeps
        # KeyboardInterrupt and SystemExit propagating.
        self.ner_model = NERModel("distilbert", "distilbert-base-uncased",
                                  labels=custom_labels, use_cuda=False)
    self.deps_model = ClassifyModel(deps_path)
def create_model(model_class, model_type, model_name, num_labels, weight, args, use_cuda, cuda_device, **kwargs):
    """Instantiate the model class named by ``model_class``.

    Args:
        model_class: One of ``"ClassificationModel"``,
            ``"MultiLabelClassificationModel"``, ``"QuestionAnsweringModel"``,
            ``"NERModel"`` or ``"T5Model"``.
        model_type, model_name: Forwarded to the model constructor.
        num_labels, weight: Forwarded to the two classification models only.
        args: Model arguments; ignored for ``"T5Model"``, which always gets
            a fresh ``T5Args`` with multiprocessed decoding turned off.
        use_cuda, cuda_device: Forwarded to the model constructor.
        **kwargs: Extra keyword arguments forwarded unchanged.

    Returns:
        The constructed model instance.

    Raises:
        ValueError: If ``model_class`` is not one of the supported names.
    """
    if model_class in ("ClassificationModel", "MultiLabelClassificationModel"):
        # Both classification flavours share one positional signature.
        if model_class == "ClassificationModel":
            classifier_cls = ClassificationModel
        else:
            classifier_cls = MultiLabelClassificationModel
        return classifier_cls(model_type, model_name, num_labels, weight,
                              args, use_cuda, cuda_device, **kwargs)

    if model_class == "QuestionAnsweringModel":
        return QuestionAnsweringModel(model_type, model_name, args, use_cuda,
                                      cuda_device, **kwargs)

    if model_class == "NERModel":
        return NERModel(model_type, model_name, args=args, use_cuda=use_cuda,
                        cuda_device=cuda_device, **kwargs)

    if model_class == "T5Model":
        t5_args = T5Args()
        t5_args.use_multiprocessed_decoding = False
        return T5Model(model_type, model_name, args=t5_args,
                       use_cuda=use_cuda, cuda_device=cuda_device, **kwargs)

    message = "{} is either invalid or not yet implemented.".format(model_class)
    raise ValueError(message)
def load_models():
    """Load the Telugu POS and NER models plus a blank Telugu spaCy pipeline.

    Returns:
        Tuple ``(pos_model, ner_model, spacy_telugu_model)``.
    """
    pos_labels = [
        'QC', 'JJ', 'NN', 'QF', 'RDP', 'O', 'NNO', 'PRP', 'RP', 'VM', 'WQ',
        'PSP', 'UT', 'CC', 'INTF', 'SYMP', 'NNP', 'INJ', 'SYM', 'CL', 'QO',
        'DEM', 'RB', 'NST',
    ]
    ner_labels = [
        'B-PERSON', 'I-ORG', 'B-ORG', 'I-LOC', 'B-MISC', 'I-MISC',
        'I-PERSON', 'B-LOC', 'O',
    ]

    # Both models run on CPU with multiprocessing switched off.
    pos_model = NERModel(
        'bert',
        'kuppuluri/telugu_bertu_pos',
        args={"use_multiprocessing": False},
        labels=pos_labels,
        use_cuda=False,
    )
    ner_model = NERModel(
        'bert',
        'kuppuluri/telugu_bertu_ner',
        labels=ner_labels,
        use_cuda=False,
        args={"use_multiprocessing": False},
    )

    spacy_telugu_model = spacy.blank("te")
    return pos_model, ner_model, spacy_telugu_model
def predict():
    """Tag the aspects of an uploaded CSV and redirect to the dashboard.

    On a POST request the uploaded ``file`` is read as CSV, its
    ``sentence`` column is passed to the aspect model, and the predicted
    tags are printed. Every request is answered with a redirect to the
    Tableau view.
    """
    if request.method == 'POST':
        test_sents = pd.read_csv(request.files.get('file'))

        # The model is loaded per request, exactly as before.
        aspect_model = NERModel(
            "bert",
            "/Users/mutaz/Desktop/Mutaz Thesis bert/hotel_reviews",
            use_cuda=False,
            labels=["B-A", "I-A", "O"],
            args={
                "use_cuda": False,
                "save_eval_checkpoints": False,
                "save_steps": -1,
                "output_dir": "MODEL",
                'overwrite_output_dir': True,
                "save_model_every_epoch": False,
            },
        )

        predictions, _ = aspect_model.predict(test_sents["sentence"])
        # Every token prediction is a mapping whose values are the tags;
        # keep only the tags, one list per sentence.
        pred = [[tag for token in sentence for tag in token.values()]
                for sentence in predictions]
        print(pred)
        # connect(test_sents, aspects_prds)

    # Returned on every path so the view never yields ``None``.
    return redirect('https://dub01.online.tableau.com/#/site/absa/views/absa_project/MAIN?:iid=20&:original_view=y')
def __init__(self, wrds_per_pred=250):
    """Store the chunking settings, load the punctuation model and warm it up.

    Args:
        wrds_per_pred: Value stored as ``self.wrds_per_pred``.
    """
    self.wrds_per_pred = wrds_per_pred
    self.overlap_wrds = 30
    self.valid_labels = [
        'OU', 'OO', '.O', '!O', ',O', '.U', '!U', ',U', ':O', ';O', ':U',
        "'O", '-O', '?O', '?U',
    ]

    model_args = {"silent": True, "max_seq_length": 512}
    self.model = NERModel(
        "bert",
        "felflare/bert-restore-punctuation",
        labels=self.valid_labels,
        args=model_args,
    )

    # HACK: use_cuda is not working; overriding the device like this seems
    # to load the model onto the GPU correctly.
    self.model.device = torch.device("cuda:1")

    # Dummy call so the model is actually loaded onto the GPU.
    self.punctuate("hello how are you")
def main(trainingdataset, testdataset, outputdir):
    """Train a multilingual BERT NER model with evaluation during training.

    Args:
        trainingdataset: Training data handed to ``train_model``.
        testdataset: Evaluation data, passed as ``eval_df``.
        outputdir: Directory used as the model's ``output_dir``.
    """
    print()

    torch.cuda.empty_cache()

    ner_labels = [
        "O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG", "I-ORG",
        "B-LOC", "I-LOC",
    ]
    # Notes carried over from the original author (03.04.2020):
    # - evaluate_during_training needs the evaluation data to be passed to
    #   train_model().
    # - classification_report adds a per-label report to eval_results.txt.
    # - save_eval_checkpoints / save_model_every_epoch are off so that no
    #   intermediate models are written.
    training_args = {
        'save_model_every_epoch': False,
        'save_steps': 10000,
        'output_dir': outputdir,
        'evaluate_during_training': True,
        'overwrite_output_dir': True,
        'classification_report': True,
        'save_eval_checkpoints': False,
    }

    # use_cuda is left at the library default (the override is disabled).
    model = NERModel(
        'bert',
        'bert-base-multilingual-cased',
        labels=ner_labels,
        args=training_args,
    )

    # The evaluation data must be supplied because evaluation during
    # training is enabled above.
    model.train_model(trainingdataset, eval_df=testdataset)
def test_named_entity_recognition():
    """Smoke test: train, evaluate and predict with a BERT NER model on CPU."""
    columns = ["sentence_id", "words", "labels"]

    # Tiny demonstration corpora, one [sentence_id, word, label] row each.
    train_rows = [
        [0, "Simple", "B-MISC"],
        [0, "Transformers", "I-MISC"],
        [0, "started", "O"],
        [1, "with", "O"],
        [0, "text", "O"],
        [0, "classification", "B-MISC"],
        [1, "Simple", "B-MISC"],
        [1, "Transformers", "I-MISC"],
        [1, "can", "O"],
        [1, "now", "O"],
        [1, "perform", "O"],
        [1, "NER", "B-MISC"],
    ]
    eval_rows = [
        [0, "Simple", "B-MISC"],
        [0, "Transformers", "I-MISC"],
        [0, "was", "O"],
        [1, "built", "O"],
        [1, "for", "O"],
        [0, "text", "O"],
        [0, "classification", "B-MISC"],
        [1, "Simple", "B-MISC"],
        [1, "Transformers", "I-MISC"],
        [1, "then", "O"],
        [1, "expanded", "O"],
        [1, "to", "O"],
        [1, "perform", "O"],
        [1, "NER", "B-MISC"],
    ]
    train_df = pd.DataFrame(train_rows, columns=columns)
    eval_df = pd.DataFrame(eval_rows, columns=columns)

    # Nothing is saved to disk; cached features may be reused.
    model_args = {
        "no_save": True,
        "overwrite_output_dir": True,
        "reprocess_input_data": False,
    }
    model = NERModel("bert", "bert-base-cased", args=model_args, use_cuda=False)

    model.train_model(train_df)

    # Each call must unpack into the expected number of results.
    result, model_outputs, predictions = model.eval_model(eval_df)
    predictions, raw_outputs = model.predict(["Some arbitary sentence"])
def main():
    """Render the Streamlit page that tags user text with the saved model."""
    st.title("NER - Simple Transformers")

    custom_labels = [
        'NN', 'NNP', 'IN', 'DT', 'JJ', 'NNS', 'VBD', 'VBN', 'VBZ', 'CD', 'VB',
        'CC', 'TO', 'RB', 'VBG', 'VBP', 'PRP', 'POS', 'PRP$', 'MD', 'WDT', 'JJS',
        'JJR', 'WP', 'NNPS', 'RP', 'WRB', 'RBR', 'EX', 'RBS', 'PDT', 'WP$', 'UH', 'FW',
    ]
    # Use the GPU only when one is available.
    model = NERModel(
        "roberta",
        "best_model",
        use_cuda=torch.cuda.is_available(),
        labels=custom_labels,
    )

    st.sidebar.subheader("Parameters")
    chosen_length = st.sidebar.slider(
        "Max Seq Length",
        min_value=1,
        max_value=92,
        value=model.args.max_seq_length,
    )
    model.args.max_seq_length = chosen_length
    model.args.use_multiprocessing = False

    st.subheader("Digite o texto: ")
    user_text = st.text_area("")

    # Nothing to show until the button is pressed.
    if not st.button("Analisar"):
        return
    first_prediction = get_prediction(model, user_text)[0]
    st.write(first_prediction)
# Notebook export: train and evaluate a BERT NER model on `data`, then
# install packages used by later cells. Bare names such as `label` or
# `result` are notebook display expressions; `!pip ...` lines are shell
# magics and only run inside IPython/Jupyter.
from simpletransformers.ner import NERModel, NERArgs

# Distinct values of the "labels" column become the model's label set.
label=data["labels"].unique().tolist()
label

# Training configuration.
args=NERArgs()
args.num_train_epochs=1
args.learning_rate=1e-4
args.overwrite_output_dir=True
args.train_batch_size=32
args.eval_batch_size=32

model=NERModel('bert', 'bert-base-cased', labels=label, args=args)

# `acc=accuracy_score` is passed as an extra keyword to train_model.
# NOTE(review): `train_data`, `test_data` and `accuracy_score` are defined
# in cells not visible here - confirm.
model.train_model(train_data, eval_data=test_data, acc=accuracy_score)

result, model_outputs, preds_list=model.eval_model(test_data)
result

# Quick prediction on a single sentence.
prediction, model_output=model.predict(["This is Nishi"])
prediction

!pip install bert-extractive-summarizer
!pip install wikipedia
"do_lower_case": True, "silent": True, "reprocess_input_data": True } labels = [ 'B-ORGANIZATION', 'I-ORGANIZATION', 'B-SUBJECT', 'I-SUBJECT', 'B-OBJECT', 'I-OBJECT', 'B-CODEX', 'I-CODEX', 'B-NUMBER', 'I-NUMBER', 'B-LAWFACE', 'I-LAWFACE', 'B-PHYSFACE', 'I-PHYSFACE', 'B-REGISTRY', 'I-REGISTRY', 'B-SIZES', 'I-SIZES', 'B-DATE', 'I-DATE', 'B-SPARESUBJECT', 'I-SPARESUBJECT', 'B-CADASTRE', 'I-CADASTRE', 'B-ADDRESS', 'I-ADDRESS', 'O' ] model = NERModel(model_type='distilbert', model_name='outputs/', args=args, use_cuda=False, labels=labels) async def main(loop=None): ner_controller = NERController(model) application = Application(middlewares=[middleware.logging], logger=logger) application.router.add_post('/api/ner', ner_controller.post) runner = AppRunner(application) await runner.setup() site = TCPSite(runner, environ['APP_HOST'], int(environ['APP_PORT'])) await site.start()
class NerModel:
    """Convenience wrapper around a BERT ``NERModel`` and its dataset.

    ``dataset`` is used as a mapping with the entries ``'labels_list'``,
    ``'train'`` and ``'val'``.
    """

    def __init__(self, modelname="", dataset=None, use_saved_model=False):
        """Create the wrapped model.

        Args:
            modelname: Suffix of the output directory
                (``outputs_<modelname>``).
            dataset: Mapping described on the class, or ``None`` when the
                instance is only used for prediction.
            use_saved_model: When true, load the model from the output
                directory instead of starting from ``bert-base-cased``.
        """
        self.dataset = dataset
        output_dir = "outputs_{}".format(modelname)

        model_args = {
            'output_dir': output_dir,
            'overwrite_output_dir': True,
            'reprocess_input_data': True,
            'save_eval_checkpoints': False,
            'save_steps': -1,
            'save_model_every_epoch': False,
            'train_batch_size': 10,
            'num_train_epochs': 10,
            'max_seq_length': 256,
            'gradient_accumulation_steps': 8,
        }
        # ``dataset`` defaults to None (and train()/eval() explicitly cope
        # with that), so the label list may only be read when a dataset
        # was supplied; otherwise construction crashed with a TypeError.
        # NOTE(review): without 'labels_list' NERModel presumably uses the
        # labels stored with a saved model or its defaults - confirm.
        if dataset is not None:
            model_args['labels_list'] = dataset['labels_list']

        model_name = output_dir if use_saved_model else "bert-base-cased"
        self.model = NERModel("bert", model_name, use_cuda=False, args=model_args)

    def train(self):
        """Train the model on ``dataset['train']``.

        Raises:
            ValueError: If no (or an empty) dataset was supplied.
        """
        if not self.dataset:
            raise ValueError("dataset is None")
        self.model.train_model(self.dataset['train'])

    def eval(self):
        """Evaluate the model on ``dataset['val']`` and print the result.

        Raises:
            ValueError: If no (or an empty) dataset was supplied.
        """
        if not self.dataset:
            raise ValueError("dataset is None")
        result, model_outputs, predictions = self.model.eval_model(self.dataset['val'])
        print("Evaluation result:", result)

    def simple_test(self):
        """Predict two fixed sentences and print per-word details."""
        sentences = ["Some arbitary sentence", "Simple Transformers sentence"]
        predictions, raw_outputs = self.model.predict(sentences)
        print(predictions)

        # More detailed predictions: word, predicted label, the highest
        # probability and the full probability list.
        for n, (preds, outs) in enumerate(zip(predictions, raw_outputs)):
            print("\n___________________________")
            print("Sentence: ", sentences[n])
            for pred, out in zip(preds, outs):
                key = list(pred.keys())[0]
                new_out = out[key]
                # Separate name so the sequence being iterated is not
                # rebound inside its own loop. The raw outputs for the
                # word are averaged over axis 0 before the softmax.
                probs = list(softmax(np.mean(new_out, axis=0)))
                print(key, pred[key], probs[np.argmax(probs)], probs)

    def predict(self, sentences):
        """Return the model's predictions for ``sentences`` (raw outputs are dropped)."""
        predictions, _ = self.model.predict(sentences)
        return predictions
"use_early_stopping": True, "early_stopping_patience": 4, "evaluate_during_training": True, "reprocess_input_data": False, "use_cached_eval_features": True, "fp16": False, "num_train_epochs": 10, "evaluate_during_training_steps": 10000, "train_batch_size": 32, 'cross_entropy_ignore_index': 0, 'classification_report': True } model = NERModel("electra", "discriminator_trained/discriminator_model", args=train_args, labels=labels, use_cuda=True, crf=True) # Train the model model.train_model(train_file, eval_data=eval_file) # Evaluate the model test_file = 'data_set/test.ner.small.txt' result, model_outputs, predictions = model.eval_model(train_file) print(result) # from transformers import ElectraTokenizer, ElectraForPreTraining # model_name = r'D:\git_learn\simpletransformers\examples\language_model\discriminator_trained\discriminator_model' # model = ElectraForPreTraining.from_pretrained(model_name)
transformers_logger.setLevel(logging.WARNING) # Creating train_df and eval_df for demonstration train_data = [ [0, 'Simple', 'B-MISC'], [0, 'Transformers', 'I-MISC'], [0, 'started', 'O'], [1, 'with', 'O'], [0, 'text', 'O'], [0, 'classification', 'B-MISC'], [1, 'Simple', 'B-MISC'], [1, 'Transformers', 'I-MISC'], [1, 'can', 'O'], [1, 'now', 'O'], [1, 'perform', 'O'], [1, 'NER', 'B-MISC'] ] train_df = pd.DataFrame(train_data, columns=['sentence_id', 'words', 'labels']) eval_data = [ [0, 'Simple', 'B-MISC'], [0, 'Transformers', 'I-MISC'], [0, 'was', 'O'], [1, 'built', 'O'], [1, 'for', 'O'], [0, 'text', 'O'], [0, 'classification', 'B-MISC'], [1, 'Simple', 'B-MISC'], [1, 'Transformers', 'I-MISC'], [1, 'then', 'O'], [1, 'expanded', 'O'], [1, 'to', 'O'], [1, 'perform', 'O'], [1, 'NER', 'B-MISC'] ] eval_df = pd.DataFrame(eval_data, columns=['sentence_id', 'words', 'labels']) # print(train_df) # print(eval_df) # Create a NERModel model = NERModel('bert', 'bert-base-cased', args={'overwrite_output_dir': True, 'reprocess_input_data': True}) # Train the model model.train_model(train_df) # # Evaluate the model result, model_outputs, predictions = model.eval_model(eval_df) print(result, predictions) # # Predictions on arbitary text strings predictions, raw_outputs = model.predict(["Simple Transformers started with text classification"]) print(predictions) # print(raw_outputs)
class RestorePuncts:
    """Restore punctuation and capitalisation with a token-classification model.

    Long inputs are cut into overlapping word slices, each slice is tagged
    by the model, and the per-slice predictions are merged back into one
    punctuated text.
    """

    def __init__(self, wrds_per_pred=250):
        """Store the chunking settings, load the model and warm it up.

        Args:
            wrds_per_pred: Number of words per slice (without the overlap).
        """
        self.wrds_per_pred = wrds_per_pred
        self.overlap_wrds = 30
        self.valid_labels = [
            'OU', 'OO', '.O', '!O', ',O', '.U', '!U', ',U', ':O', ';O', ':U',
            "'O", '-O', '?O', '?U'
        ]
        self.model = NERModel("bert",
                              "felflare/bert-restore-punctuation",
                              labels=self.valid_labels,
                              args={
                                  "silent": True,
                                  "max_seq_length": 512
                              })
        # HACK: use_cuda is not working; overriding the device like this
        # seems to load the model onto the GPU correctly.
        self.model.device = torch.device("cuda:1")
        # Dummy call so the model is actually loaded onto the GPU.
        self.punctuate("hello how are you")

    def punctuate(self, text: str, batch_size: int = 32, lang: str = ''):
        """Perform punctuation restoration on arbitrarily large text.

        Args:
            text (str): Text to punctuate, from a few words to any length.
            batch_size (int): Number of slices sent to the model per call.
            lang (str): Currently ignored. Language detection (which would
                reject non-English input) is disabled in this version.

        Returns:
            str: The punctuated text.
        """
        # Split up large text into BERT-digestible, overlapping slices.
        splits = self.split_on_toks(text, self.wrds_per_pred,
                                    self.overlap_wrds)
        texts = [piece["text"] for piece in splits]

        # Predict slice by slice, in batches; the raw outputs are dropped.
        preds_lst = []
        for start in range(0, len(texts), batch_size):
            batch_preds, _ = self.model.predict(texts[start:start + batch_size])
            preds_lst.extend(batch_preds)

        # Join the slice predictions, then apply them to the words.
        combined_preds = self.combine_results(text, preds_lst)
        return self.punctuate_texts(combined_preds)

    def predict(self, input_slice):
        """Pass one unpunctuated slice to the model.

        Returns:
            Tuple ``(predictions, raw_outputs)`` exactly as returned by the
            model for the single-element batch.
        """
        predictions, raw_outputs = self.model.predict([input_slice])
        return predictions, raw_outputs

    @staticmethod
    def split_on_toks(text, length, overlap):
        """Split text into overlapping word slices with character offsets.

        This bypasses the 512 token limit of transformer models by feeding
        them chunks of fewer words sequentially.

        Args:
            text: Input text; newlines are treated as spaces.
            length: Words per slice (must be positive).
            overlap: Extra words taken from the start of the next slice.

        Returns:
            List of dicts, for example
            ``[{...}, {"text": "...", 'start_idx': 31354, 'end_idx': 32648}, {...}]``.

        Raises:
            ValueError: If ``length`` is not positive (the slicing loop
                would otherwise never terminate).
        """
        if length <= 0:
            raise ValueError("length must be a positive number of words")

        wrds = text.replace('\n', ' ').split(" ")
        resp = []
        lst_chunk_idx = 0
        i = 0

        while True:
            chunk_start = length * i
            chunk_end = length * (i + 1)
            # Words of the chunk itself and of the overlapping portion.
            wrds_len = wrds[chunk_start:chunk_end]
            wrds_ovlp = wrds[chunk_end:chunk_end + overlap]
            wrds_split = wrds_len + wrds_ovlp

            # Break loop if no more words.
            if not wrds_split:
                break

            wrds_str = " ".join(wrds_split)
            resp.append({
                "text": wrds_str,
                "start_idx": lst_chunk_idx,
                "end_idx": len(wrds_str) + lst_chunk_idx,
            })

            # The next chunk starts after this chunk's own words plus the
            # separating space (the overlap is not counted).
            lst_chunk_idx += len(" ".join(wrds_len)) + 1
            i += 1

        logging.info(f"Sliced transcript into {len(resp)} slices.")
        return resp

    @staticmethod
    def combine_results(full_text: str, text_slices):
        """Merge the per-slice predictions into one prediction list.

        Args:
            full_text: The original, unsliced text.
            text_slices: One list of ``{word: label}`` dicts per slice.

        Returns:
            List of ``(word, label)`` tuples covering every word of
            ``full_text`` in order.

        Raises:
            AssertionError: If the merged words do not reproduce the words
                of ``full_text``.
        """
        split_full_text = [
            wrd for wrd in full_text.replace('\n', ' ').split(" ") if wrd
        ]
        split_full_text_len = len(split_full_text)
        output_text = []
        index = 0

        # A very short trailing slice is dropped; the length check comes
        # first so an empty ``text_slices`` no longer raises IndexError.
        if len(text_slices) > 1 and len(text_slices[-1]) <= 3:
            text_slices = text_slices[:-1]

        for _slice in text_slices:
            slice_wrds = len(_slice)
            # Computed once per slice instead of once per word.
            is_last = text_slices[-1] == _slice

            for ix, wrd in enumerate(_slice):
                if index == split_full_text_len:
                    break

                pred_item_tuple = list(wrd.items())[0]
                if split_full_text[index] != str(pred_item_tuple[0]):
                    continue

                # The last two words of a non-final slice are skipped here.
                if is_last or ix <= slice_wrds - 3:
                    index += 1
                    output_text.append(pred_item_tuple)

        assert [i[0] for i in output_text] == split_full_text
        return output_text

    @staticmethod
    def punctuate_texts(full_pred: list):
        """Apply the predicted labels to the words, thus punctuating them.

        A label ending in ``"U"`` capitalises the word; a label whose first
        character is not ``"O"`` appends that character to the word.

        Args:
            full_pred: List of ``(word, label)`` tuples.

        Returns:
            str: The punctuated text, ``""`` for an empty prediction list.
        """
        words = []
        for word, label in full_pred:
            punct_wrd = word.capitalize() if label[-1] == "U" else word
            if label[0] != "O":
                punct_wrd += label[0]
            words.append(punct_wrd)

        punct_resp = " ".join(words).strip()
        # Append trailing period if it doesn't exist; an empty result stays
        # empty instead of raising IndexError.
        if punct_resp and punct_resp[-1].isalnum():
            punct_resp += "."
        return punct_resp
[0, "built", "O"], [0, "for", "O"], [0, "text", "O"], [0, "classification", "B-MISC"], [1, "Simple", "B-MISC"], [1, "Transformers", "I-MISC"], [1, "then", "O"], [1, "expanded", "O"], [1, "to", "O"], [1, "perform", "O"], [1, "NER", "B-MISC"], ] eval_df = pd.DataFrame(eval_data, columns=["sentence_id", "words", "labels"]) # Create a NERModel model = NERModel("bert", "bert-base-cased", args={"overwrite_output_dir": True, "reprocess_input_data": True}) # # Train the model # model.train_model(train_df) # # Evaluate the model # result, model_outputs, predictions = model.eval_model(eval_df) # Predictions on arbitary text strings sentences = ["Some arbitary sentence", "Simple Transformers sentence"] predictions, raw_outputs = model.predict(sentences) print(predictions) # More detailed preditctions
# Create a NERModel model_args = { 'overwrite_output_dir': True, 'reprocess_input_data': True, 'save_eval_checkpoints': False, 'save_steps': -1, 'save_model_every_epoch': False, 'train_batch_size': 10, 'num_train_epochs': 100, # 5 'max_seq_length': 256, 'gradient_accumulation_steps': 8 } model = NERModel("bert", "bert-base-cased", use_cuda=False, args=model_args) # args={"overwrite_output_dir": True, "reprocess_input_data": True} train_df = eval_df # # Train the model model.train_model(train_df) # # Evaluate the model result, model_outputs, predictions = model.eval_model(eval_df) # Predictions on arbitary text strings sentences = ["Some arbitary sentence", "Simple Transformers sentence"] predictions, raw_outputs = model.predict(sentences)
if __name__ == '__main__':
    # Score the saved aspect model on the first 100 test rows (strict F1).
    train_df = txt_to_df("/Users/mutaz/Desktop/Mutaz Thesis bert/Data/raw/train.txt")
    test_df = txt_to_df("/Users/mutaz/Desktop/Mutaz Thesis bert/Data/raw/test.txt")
    test_df.reset_index(drop=True, inplace=True)
    # test_df["Sentence #"] = test_df["Sentence #"].values

    logging.basicConfig(level=logging.INFO)
    transformers_logger = logging.getLogger("transformers")
    transformers_logger.setLevel(logging.WARNING)

    # Rebuild both frames with the column names the model works with.
    train_df = pd.DataFrame(train_df.values, columns=['sentence_id', 'words', 'labels'])
    test_df = pd.DataFrame(test_df.values, columns=['sentence_id', 'words', 'labels'])

    model_used = "/Users/mutaz/Desktop/Mutaz Thesis bert/hotel_reviews"
    mm, epp, lrr, f11, results = [], [], [], [], {}

    aspect_model = NERModel(
        "bert",
        "{}".format(model_used),
        labels=["B-A", "I-A", "O"],
        args={
            "save_eval_checkpoints": False,
            "save_steps": -1,
            'overwrite_output_dir': True,
            "save_model_every_epoch": False,
            'reprocess_input_data': True,
            "train_batch_size": 5,
            'num_train_epochs': 2,
            "gradient_accumulation_steps": 5,
            "output_dir": "/Users/mutaz/Desktop/Mutaz Thesis bert",
            "learning_rate": 0.0001,
        },
        use_cuda=False,
    )
    # aspect_model.train_model(train_df)

    # Only the first 100 rows are scored.
    test_df = test_df.iloc[:100]
    a = test_df.groupby("sentence_id")
    gps = [a.get_group(key) for key, _ in a]

    # Gold labels and the space-joined words, one entry per sentence group.
    actual = [list(g.labels.values) for g in gps]
    sentences = [" ".join(g.words.values) for g in gps]

    predictions, raw_outputs = aspect_model.predict(sentences)
    # Keep only the tags of every predicted token mapping.
    pred = [[tag for token in sentence for tag in token.values()]
            for sentence in predictions]

    clear_output()
    f1 = f1_score(actual, pred, mode='strict', scheme=IOB2)
    print(f1)
line = line.split('\t') if len(line) == 3: eval_data.append(line) eval_df = pd.DataFrame(eval_data, columns=['sentence_id', 'words', 'labels']) # test = eval_df.groupby('sentence_id').count()[['words']] # print(test.query("words > 128")) # Create a NERModel model = NERModel('albert', 'albert-base-v2', use_cuda=False, args={ 'overwrite_output_dir': True, 'reprocess_input_data': True }) # # Train the model model.train_model(train_df) # Evaluate the model result, model_outputs, predictions = model.eval_model(eval_df) # Predictions on arbitary text strings predictions, raw_outputs = model.predict( ["Peter er lige begyndt til badminton."]) print(predictions)
"evaluate_during_training": False, # "process_count": cpu_count() - 2 if cpu_count() > 2 else 1, "process_count": 2, "n_gpu": 1, } from simpletransformers.ner import NERModel import logging logging.basicConfig(level=logging.INFO) transformers_logger = logging.getLogger("transformers") transformers_logger.setLevel(logging.WARNING) # Create a NERModel model = NERModel('bert', 'bert-base-cased', use_cuda=True, labels=label_list, args=args) model.train_model(train_df) sentences = [ "Mary M. Mrdutt, M.D., from the Baylor Scott & White Memorial Hospital in Temple, Texas, and colleagues prospectively measured frailty in elective surgery patients in a health care system.", "To be clear, there are currently no legal requirements for any cosmetic manufacturer marketing products to American consumers to test their products for safety" ] predictions, raw_outputs = model.predict(sentences) print(predictions)
def main(sentence):
    """Print the NER labels predicted for ``sentence`` by the model in ``outputs/``."""
    tagger = NERModel('bert', 'outputs/', use_cuda=False)

    # The model works on a batch, so the sentence is wrapped in a list.
    tagged, _ = tagger.predict([sentence])
    print(tagged)
[0, "text", "O"], [0, "classification", "B-MISC"], [1, "Simple", "B-MISC"], [1, "Transformers", "I-MISC"], [1, "then", "O"], [1, "expanded", "O"], [1, "to", "O"], [1, "perform", "O"], [1, "NER", "B-MISC"], ] eval_df = pd.DataFrame(eval_data, columns=["sentence_id", "words", "labels"]) # Create a NERModel model = NERModel("bert", "bert-base-cased", args={ "overwrite_output_dir": True, "reprocess_input_data": True }) # Train the model model.train_model(train_df) # Evaluate the model result, model_outputs, predictions = model.eval_model(eval_df) # Predictions on arbitary text strings predictions, raw_outputs = model.predict(["Some arbitary sentence"]) print(predictions)
class DependencyModel:
    """Relation extraction from an NER model plus a path classifier.

    The NER model marks the two entities of a sentence; the classifier
    labels the shortest dependency-tree path between them.
    """

    def __init__(self, ner_path: str, deps_path: str):
        """Create/Load a new DependencyModel

        Args:
            ner_path (str): directory of NER model. (if not exists, create a new model)
            deps_path (str): directory of classification model.
        """
        self.ner_path = ner_path
        try:
            self.ner_model = NERModel("distilbert", ner_path, use_cuda=False)
        except Exception:
            # Loading from ``ner_path`` failed, so start from the pretrained
            # base model. ``Exception`` (not a bare ``except``) keeps
            # KeyboardInterrupt and SystemExit propagating.
            self.ner_model = NERModel("distilbert", "distilbert-base-uncased",
                                      labels=custom_labels, use_cuda=False)
        self.deps_model = ClassifyModel(deps_path)

    def train_ner(self, ner_data: Iterator[Tuple[str, str, str]]):
        """Train the NER model

        Args:
            ner_data (Iterator[Tuple[str, str, str]]): iterator of tuple of (sentence, entity1, entity2).
        """
        tokenizer = nltk.RegexpTokenizer(r"[A-Za-z']+")
        rows = []
        for i, (sentence, e1, e2) in enumerate(ner_data):
            entity_words = {e1.lower(), e2.lower()}
            for word in tokenizer.tokenize(sentence):
                # "E" marks a word equal (case-insensitively) to one of the
                # two entities; everything else is "O".
                label = "E" if word.lower() in entity_words else "O"
                rows.append((i, word, label))

        train_data = pd.DataFrame(rows, columns=["sentence_id", "words", "labels"])

        # ``train_model`` is the public entry point that accepts a
        # DataFrame; ``train`` is the internal loop and expects an already
        # converted dataset.
        # NOTE(review): overwriting is enabled because ``ner_path`` is also
        # the directory the model is loaded from - confirm this is wanted.
        self.ner_model.train_model(train_data,
                                   output_dir=self.ner_path,
                                   args={"overwrite_output_dir": True})

    def train_deps(self, train_data: Iterator[Tuple[str, Tuple[str, str, str]]]):
        """Train the dependency tree path classification model.

        Args:
            train_data (Iterator[Tuple[str, Tuple[str, str, str]]]): iterator of (sentence, (relation, entity1, entity2))
        """
        classify_data: List[Tuple[str, int]] = []
        for sentence, (relation, e1, e2) in train_data:
            tree = DepTree(sentence)
            path = " ".join(tree.shortest_path(e1, e2))
            classify_data.append((path, relation_list.index(relation)))

        # Train on the (path, relation index) pairs built above; the raw
        # ``train_data`` was previously passed here by mistake, leaving
        # ``classify_data`` unused.
        self.deps_model.train(classify_data)

    def train(self, train_data: Iterator[Tuple[str, Tuple[str, str, str]]]):
        """Train both the NER model AND the classification model.

        Args:
            train_data (Iterator[Tuple[str, Tuple[str, str, str]]]): iterator of (sentence, (relation, entity1, entity2))
        """
        # Materialise once: the data is consumed by both training steps.
        train_data = list(train_data)
        self.train_ner([
            (sent, e1, e2) for sent, (rela, e1, e2) in train_data
        ])
        self.train_deps(train_data)

    def predict(self, data: Iterable[str]) -> List[Tuple[str, Tuple[str, str]]]:
        """Predict the relation extracted from input sentences.

        Args:
            data (Iterable[str]): list of input sentences.

        Returns:
            List[Tuple[str, Tuple[str, str]]]: list of (relation, (entity1, entity2))
        """
        data = list(data)
        _, raws = self.ner_model.predict(data)

        predict_data = []
        entities = []
        for raw, sentence in zip(raws, data):
            # The two entities are derived from the raw model output.
            [e1, e2] = get_entity(raw)
            tree = DepTree(sentence)
            entities.append((e1, e2))
            predict_data.append(" ".join(tree.shortest_path(e1, e2)))

        return list(zip(self.deps_model.predict(predict_data), entities))
class BioAnalysis:
    """Experiment driver for entity tagging (CRF, RoBERTa NER) and linking (RoBERTa NEL).

    Loads train/dev/test data once, derives the model-specific views of it, and
    exposes train / evaluate / load steps for each model plus a combined
    NER -> NEL pipeline.
    """

    def __init__(self,
                 train_file="./data/train.tsv",
                 dev_file="./data/dev.tsv",
                 test_file="./data/test.tsv"):
        """Load the three data splits and prepare per-model inputs.

        Args:
            train_file: path of the training split.
            dev_file: path of the development split.
            test_file: path of the test split.
        """
        self.train_data = file_opener(train_file)
        self.dev_data = file_opener(dev_file)
        self.test_data = file_opener(test_file)
        # NOTE(review): the test item at index 192 is dropped unconditionally;
        # the reason is not visible in this file - confirm it is still wanted.
        self.test_data.pop(192)

        self.crf = sklearn_crfsuite.CRF(algorithm='lbfgs',
                                        c1=0.09684573395986483,
                                        c2=0.0800864058815976,
                                        max_iterations=100,
                                        all_possible_transitions=True)
        # Filled in by crf_predict().
        self.dev_predicted = None
        self.test_predicted = None
        self.dev_labels = None
        self.test_labels = None

        # Label set handed to the RoBERTa NER model.
        self.labels = [
            'B-Abiotic_Entity', 'B-Aggregate_Biotic_Abiotic_Entity',
            'B-Biotic_Entity', 'B-Eventuality', 'B-Location', 'B-Quality',
            'B-Time', 'B-Unit', 'B-Value',
            'I-Abiotic_Entity', 'I-Aggregate_Biotic_Abiotic_Entity',
            'I-Biotic_Entity', 'I-Eventuality', 'I-Location', 'I-Quality',
            'I-Time', 'I-Unit', 'I-Value',
            'O'
        ]

        # Entity-linking (NEL) state.
        self.roberta_nel_model = None
        self.roberta_nel_dev_eval = None
        self.roberta_nel_test_eval = None
        self.roberta_nel_dev_links = None
        self.roberta_nel_test_links = None
        self.roberta_nel_train_data, _ = get_roberta_nel_data(self.train_data)
        self.roberta_nel_dev_data, self.roberta_nel_dev_spans = get_roberta_nel_data(
            self.dev_data)
        self.roberta_nel_test_data, self.roberta_nel_test_spans = get_roberta_nel_data(
            self.test_data)

        # Entity-recognition (NER) state.
        self.roberta_ner_model = None
        self.roberta_ner_dev_eval = None
        self.roberta_ner_test_eval = None
        self.roberta_ner_train_data = get_roberta_ner_data(self.train_data)
        self.roberta_ner_dev_data = get_roberta_ner_data(self.dev_data)
        self.roberta_ner_test_data = get_roberta_ner_data(self.test_data)

    def crf_fit(self):
        """Fit the CRF on features/labels extracted from the training split."""
        self.crf.fit(*get_features_labels(self.train_data))

    def crf_predict(self):
        """Predict dev and test labels with the CRF and keep the gold labels."""
        dev_feat, self.dev_labels = get_features_labels(self.dev_data)
        test_feat, self.test_labels = get_features_labels(self.test_data)
        self.dev_predicted = self.crf.predict(dev_feat)
        self.test_predicted = self.crf.predict(test_feat)

    @staticmethod
    def _print_crf_scores(y_true, y_pred, verbose, kwargs):
        """Print F1 (and precision/recall when verbose) for one split."""
        if verbose:
            print("Precision:",
                  metrics.flat_precision_score(y_true, y_pred, **kwargs))
            print("Recall:", metrics.flat_recall_score(y_true, y_pred, **kwargs))
        print("F1:", metrics.flat_f1_score(y_true, y_pred, **kwargs))

    def crf_evaluate(self, verbose=False, labels=False):
        """Print weighted CRF scores on dev and test, ignoring the "O" label.

        Args:
            verbose: also print precision and recall, not only F1.
            labels: labels to score; when falsy the CRF's own ``classes_``
                are used.
        """
        candidates = labels if labels else self.crf.classes_
        # Build a new list instead of calling .remove("O") on the input:
        # removing in place would mutate the caller's list (e.g. self.labels)
        # and raise ValueError on a second call with the same list.
        lab = [label for label in candidates if label != "O"]
        kwargs = {"average": "weighted", "labels": lab}
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            print("Dev Results\n===========")
            self._print_crf_scores(self.dev_labels, self.dev_predicted,
                                   verbose, kwargs)
            print("\nTest Results\n============")
            self._print_crf_scores(self.test_labels, self.test_predicted,
                                   verbose, kwargs)

    def roberta_nel_train(self):
        """Train a fresh RoBERTa classification model for entity linking."""
        train_df = pd.DataFrame(self.roberta_nel_train_data)
        self.roberta_nel_model = ClassificationModel(
            "roberta",
            "roberta-base",
            args={
                "num_train_epochs": 3,
                "overwrite_output_dir": True,
                "output_dir": "nel_outputs/"
            })
        self.roberta_nel_model.train_model(train_df)

    def roberta_nel_eval(self):
        """Evaluate the NEL model on dev and test and print the F1 of each."""
        dev_df = pd.DataFrame(self.roberta_nel_dev_data)
        test_df = pd.DataFrame(self.roberta_nel_test_data)
        # f1_score is passed under the keyword "acc", so it is read back
        # from the result dict under the key "acc".
        self.roberta_nel_dev_eval = self.roberta_nel_model.eval_model(
            dev_df, acc=f1_score)
        self.roberta_nel_test_eval = self.roberta_nel_model.eval_model(
            test_df, acc=f1_score)
        print("Dev NEL Results\n===========")
        print("F1:", self.roberta_nel_dev_eval[0]["acc"])
        print("\nTest NEL Results\n============")
        print("F1:", self.roberta_nel_test_eval[0]["acc"])

    def roberta_nel_load_model(self):
        """Load the NEL model previously saved under ``nel_outputs/``."""
        self.roberta_nel_model = ClassificationModel(
            "roberta", "nel_outputs/", args={"num_train_epochs": 3})

    def roberta_ner_train(self):
        """Train a fresh RoBERTa NER model on the training split."""
        train_df = pd.DataFrame(self.roberta_ner_train_data,
                                columns=['sentence_id', 'words', 'labels'])
        self.roberta_ner_model = NERModel("roberta",
                                          "roberta-base",
                                          labels=self.labels,
                                          args={
                                              "num_train_epochs": 3,
                                              "overwrite_output_dir": True,
                                              "output_dir": "ner_outputs/"
                                          })
        self.roberta_ner_model.train_model(train_df)

    def roberta_ner_eval(self):
        """Evaluate the NER model on dev and test and print P/R/F1 of each.

        The full ``eval_model`` results are kept on the instance; the combined
        pipeline later reads the predicted tags from index 2 of each.
        """
        columns = ['sentence_id', 'words', 'labels']
        dev_df = pd.DataFrame(self.roberta_ner_dev_data, columns=columns)
        test_df = pd.DataFrame(self.roberta_ner_test_data, columns=columns)
        self.roberta_ner_dev_eval = self.roberta_ner_model.eval_model(
            dev_df, "./ner_outputs/")
        self.roberta_ner_test_eval = self.roberta_ner_model.eval_model(
            test_df, "./ner_outputs/")
        print("Dev NER Results\n===========")
        print("Precision:", self.roberta_ner_dev_eval[0]["precision"])
        print("Recall:", self.roberta_ner_dev_eval[0]["recall"])
        print("F1:", self.roberta_ner_dev_eval[0]["f1_score"])
        print("\nTest NER Results\n============")
        print("Precision:", self.roberta_ner_test_eval[0]["precision"])
        print("Recall:", self.roberta_ner_test_eval[0]["recall"])
        print("F1:", self.roberta_ner_test_eval[0]["f1_score"])

    def roberta_ner_load_model(self):
        """Load the NER model previously saved under ``ner_outputs/``."""
        self.roberta_ner_model = NERModel("roberta",
                                          "ner_outputs/",
                                          labels=self.labels,
                                          args={"num_train_epochs": 3})

    @staticmethod
    def _with_predicted_tags(sentences, predicted_tags):
        """Return a deep copy of *sentences* whose tag field (index 2 of every
        token) is replaced by the matching entry of *predicted_tags*."""
        tagged = deepcopy(sentences)
        for ii, sentence in enumerate(tagged):
            for jj, token in enumerate(sentence):
                token = list(token)  # tokens may be tuples; make them writable
                token[2] = predicted_tags[ii][jj]
                sentence[jj] = token
        return tagged

    def roberta_ner_nel_pipeline(self):
        """Run NER then NEL end to end and report the combined NEL F1.

        Saved models are loaded when available and trained otherwise. The gold
        tags of dev/test are replaced by the NER predictions before the NEL
        inputs are built, so linking is scored on predicted spans. The resulting
        links are stored in ``roberta_nel_dev_links`` / ``roberta_nel_test_links``.
        """
        try:
            self.roberta_ner_load_model()
        except OSError:
            self.roberta_ner_train()
        self.roberta_ner_eval()

        # Index 2 of each eval result holds the predicted tags per sentence.
        roberta_dev_phrases, roberta_dev_spans = get_roberta_nel_data(
            self._with_predicted_tags(self.dev_data,
                                      self.roberta_ner_dev_eval[2]))
        roberta_test_phrases, roberta_test_spans = get_roberta_nel_data(
            self._with_predicted_tags(self.test_data,
                                      self.roberta_ner_test_eval[2]))

        try:
            self.roberta_nel_load_model()
        except OSError:
            self.roberta_nel_train()

        dev_texts = [x[0] for x in roberta_dev_phrases]
        test_texts = [x[0] for x in roberta_test_phrases]
        roberta_dev_prediction = self.roberta_nel_model.predict(dev_texts)[0]
        roberta_test_prediction = self.roberta_nel_model.predict(test_texts)[0]
        roberta_dev_actual = [x[1] for x in self.roberta_nel_dev_data]
        roberta_test_actual = [x[1] for x in self.roberta_nel_test_data]

        # Predicted and gold spans can differ, so both sides are mapped through
        # their own spans and then reconciled before scoring.
        dev_prediction = transform_nel_results(roberta_dev_prediction,
                                               roberta_dev_spans)
        dev_actual = transform_nel_results(roberta_dev_actual,
                                           self.roberta_nel_dev_spans)
        dev_actual, dev_prediction = resolve_diff(dev_actual, dev_prediction)

        test_prediction = transform_nel_results(roberta_test_prediction,
                                                roberta_test_spans)
        test_actual = transform_nel_results(roberta_test_actual,
                                            self.roberta_nel_test_spans)
        test_actual, test_prediction = resolve_diff(test_actual,
                                                    test_prediction)

        print("Dev NEL Combined Results\n===========")
        print("F1:", f1_score(dev_actual, dev_prediction))
        print("Test NEL Combined Results\n===========")
        print("F1:", f1_score(test_actual, test_prediction))

        dev_output = list(zip(dev_texts, roberta_dev_prediction))
        self.roberta_nel_dev_links = get_links(dev_output)
        test_output = list(zip(test_texts, roberta_test_prediction))
        self.roberta_nel_test_links = get_links(test_output)
# Label vocabulary: one entry per line read from the already-open file `f`,
# with surrounding whitespace removed.
labels = [entry.strip() for entry in f.readlines()]

# Settings handed to the model constructor.
train_args = {
    # output / checkpointing
    "output_dir": "ner_output",
    "overwrite_output_dir": True,
    "use_multiprocessing": False,
    "save_steps": 0,
    # early stopping and in-training evaluation
    "use_early_stopping": True,
    "early_stopping_patience": 4,
    "evaluate_during_training": True,
    # feature caching
    "reprocess_input_data": False,
    "use_cached_eval_features": True,
    # optimisation
    "fp16": False,
    "num_train_epochs": 10,
    "evaluate_during_training_steps": 10000,
    "train_batch_size": 32,
    'cross_entropy_ignore_index': 0,
    'classification_report': True,
}

# Restore the ELECTRA tagger from a saved checkpoint directory.
model = NERModel(
    "electra",
    "ner_output/checkpoint-150000",
    args=train_args,
    labels=labels,
    use_cuda=True,
    crf=True,
)

# Each input is pre-split into single characters, hence split_on_space=False.
result = model.predict(
    [
        list('发烧头痛3天'),
        list('见盲肠底,升结肠近肝曲'),
    ],
    split_on_space=False,
)
print(result)
def roberta_ner_load_model(self):
    """Load the RoBERTa NER model saved under ``ner_outputs/``.

    The loaded model is stored on ``self.roberta_ner_model`` and is built
    with the label set held in ``self.labels``.
    """
    load_args = {"num_train_epochs": 3}
    ner_model = NERModel(
        "roberta",
        "ner_outputs/",
        labels=self.labels,
        args=load_args,
    )
    self.roberta_ner_model = ner_model
[1, 'perform', 'O'], [1, 'NER', 'B-MISC']] train_df = pd.DataFrame(train_data, columns=['sentence_id', 'words', 'labels']) eval_data = [[0, 'Simple', 'B-MISC'], [0, 'Transformers', 'I-MISC'], [0, 'was', 'O'], [1, 'built', 'O'], [1, 'for', 'O'], [0, 'text', 'O'], [0, 'classification', 'B-MISC'], [1, 'Simple', 'B-MISC'], [1, 'Transformers', 'I-MISC'], [1, 'then', 'O'], [1, 'expanded', 'O'], [1, 'to', 'O'], [1, 'perform', 'O'], [1, 'NER', 'B-MISC']] eval_df = pd.DataFrame(eval_data, columns=['sentence_id', 'words', 'labels']) # Create a NERModel model = NERModel('bert', 'bert-base-cased', args={ 'use_multiprocessing': False, 'overwrite_output_dir': True, 'reprocess_input_data': True }, use_cuda=False) # Train the model model.train_model(train_df) # Evaluate the model result, model_outputs, predictions = model.eval_model(eval_df) # Predictions on arbitary text strings predictions, raw_outputs = model.predict(["Some arbitary sentence"]) print(predictions)