def predict(self, input_dir, output_dir, rw_type, input_format, chunk_len=100, test_scores=False,
            output_confidence=False, special_model_path=None):
    """
    Tags each file in the input directory (txt or tsv files) and writes the results to output_dir.
    Also adds a folder "result_stats" with runtime information to the output_dir.
    tsv files must have at least the columns 'tok' and 'sentstart'.
    :param input_dir: string value: path to input directory
    :param output_dir: string value: path to output directory
    :param rw_type: string value: direct, indirect, freeIndirect or reported
    :param input_format: string value: txt or tsv
    :param chunk_len: maximum chunk length (in tokens) used when splitting the input into sentence chunks
    :return:
    """
    # time the prediction
    start_time = datetime.datetime.now().replace(microsecond=0)

    # create a subdir for testing and overview information in the output dir
    result_subdir = "result_stats"
    if not os.path.exists(os.path.join(output_dir, result_subdir)):
        os.makedirs(os.path.join(output_dir, result_subdir))

    # load the model
    # determine the current script path
    curr_path = os.path.dirname(os.path.abspath(__file__))
    if special_model_path is None:
        model_path = os.path.join(curr_path, "models", rw_type, "final-model.pt")
    else:
        model_path = os.path.join(curr_path, "models", special_model_path, "final-model.pt")
    if not os.path.exists(model_path):
        logging.warning("Predicting {} aborted. Model not found at path '{}'. Please download a model "
                        "and put it into the appropriate directory. The model file must be named "
                        "final-model.pt.".format(rw_type, model_path))
    else:
        self.logger.info("loading model {}".format(model_path))
        model = SequenceTagger.load(model_path)
        self.logger.info("model loaded")

        # if test mode, collect score data (initialize in any case)
        score_dict = {"file": [], "f1": [], "precision": [], "recall": []}
        all_predictions_df = pd.DataFrame()

        input_files = [x for x in os.listdir(input_dir)]
        for file in input_files:
            resfile_name = re.sub(r"\..+$", ".tsv", file)
            self.logger.info("predicting {}".format(file))

            # read the file and convert to dataframe
            if input_format == "txt":
                data = self.convert_txtfile_to_dateframe(os.path.join(input_dir, file))
            else:
                data = pd.read_csv(os.path.join(input_dir, file), sep="\t", quoting=3,
                                   encoding="utf-8", na_values=[])

            # check for tok column:
            if "tok" not in data.columns:
                self.logger.warning("Column 'tok' is missing in file {}. File will be skipped.".format(file))
            else:
                if "sentstart" not in data.columns:
                    self.logger.warning("Column 'sentstart' is missing in file {}. Will be added with "
                                        "default values (all 'no').".format(file))
                    data["sentstart"] = ["no"] * len(data)

                self.logger.debug("TEST: data head:\n {}".format(data.head(10)))

                # create sentlist (based on max chunk length)
                sent_list = self.create_sentlist_from_file_batchmax(data, maxlen=chunk_len,
                                                                    compare_column="NaN")
                # predict
                res_dict = {"tok": [], rw_type + "_pred": [], rw_type + "_conf": []}
                for sent in sent_list:
                    model.predict(sent)
                    pred_list = [x["type"] for x in sent.to_dict("cat")["entities"]]
                    res_dict["tok"].extend([x["text"] for x in sent.to_dict("cat")["entities"]])
                    res_dict[rw_type + "_conf"].extend([x["confidence"] for x in sent.to_dict("cat")["entities"]])
                    res_dict[rw_type + "_pred"].extend(pred_list)
                pred_df = pd.DataFrame(res_dict)

                # create output
                # if there is a mismatch in file length after prediction, still save the results
                if len(data) != len(pred_df):
                    self.logger.warning("File length changed when predicting for file {} (before: {}, after: {})\n"
                                        "Result file will be saved with prefix 'warn_'; additional columns "
                                        "are lost.".format(file, len(data), len(pred_df)))
                    pred_df.to_csv(os.path.join(output_dir, "warn_" + resfile_name), index=False, sep="\t")
                # if everything is okay, add the new column(s) to the original data and save
                else:
                    if output_confidence:
                        data[rw_type + "_conf"] = pred_df[rw_type + "_conf"]
                    data[rw_type + "_pred"] = pred_df[rw_type + "_pred"]
                    data.to_csv(os.path.join(output_dir, resfile_name), index=False, sep="\t", encoding="utf-8")

                    # calculate the test scores:
                    if test_scores:
                        self.logger.info("Calculate scores for {}".format(file))
                        if rw_type in data.columns and rw_type + "_pred" in data.columns:
                            data, f1, prec, rec = self.calculate_scores(data, rw_type)
                            score_dict["file"].append(file)
                            score_dict["f1"].append(f1)
                            score_dict["precision"].append(prec)
                            score_dict["recall"].append(rec)
                            all_predictions_df = all_predictions_df.append(data)
                        else:
                            self.logger.warning("Skipping test scores for file {}: Missing column {} and/or {}"
                                                .format(file, rw_type, rw_type + "_pred"))

        end_time = datetime.datetime.now().replace(microsecond=0)
        # write an overview file when the process is finished
        res_text = "RW Tagger (predict): Model {}\n" \
                   "Predict time:\nstart: {}\nend: {}\ntotal: {}" \
            .format(model_path, start_time, end_time, end_time - start_time)

        # if in test mode, calculate the final scores (for all the data) and save the test score df
        if test_scores:
            self.logger.info("Calculate total scores")
            if len(all_predictions_df) > 0:
                self.logger.debug("all_predictions_len: {}".format(len(all_predictions_df)))
                all_predictions_df, f1, prec, rec = self.calculate_scores(all_predictions_df, rw_type)
                score_dict["file"].append("total")
                score_dict["f1"].append(f1)
                score_dict["precision"].append(prec)
                score_dict["recall"].append(rec)
                score_df = pd.DataFrame(score_dict)
                score_df.to_csv(os.path.join(output_dir, result_subdir, rw_type + "_test_scores.tsv"),
                                index=False, sep="\t", encoding="utf-8")
                res_text += "\nTotal test scores (for detailed scores see {}_test_scores.tsv):\n" \
                            "f1: {}, precision: {}, recall: {}".format(rw_type, f1, prec, rec)
                self.logger.info("Total scores for {}: f1: {}, precision: {}, recall: {}"
                                 .format(rw_type, f1, prec, rec))

        with open(os.path.join(output_dir, result_subdir, rw_type + "_overview.txt"), "w", encoding="utf-8") as f:
            f.write(res_text)
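# --- Usage sketch (not part of the original snippet) ---
# A minimal, hedged example of how the predict() method above might be driven.
# The wrapper class name "RWTagger" and the directory paths are illustrative
# assumptions; only the method signature itself is taken from the code above.
#
# tagger = RWTagger()
# tagger.predict(
#     input_dir="data/input_tsv",    # assumed folder of .tsv files with 'tok'/'sentstart' columns
#     output_dir="data/output",      # results plus a 'result_stats' subfolder are written here
#     rw_type="direct",              # one of: direct, indirect, freeIndirect, reported
#     input_format="tsv",
#     test_scores=True,              # also compute f1/precision/recall if gold columns are present
# )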
    # comment in these lines to use contextual string embeddings
    #
    # CharLMEmbeddings('news-forward'),
    #
    # CharLMEmbeddings('news-backward'),
]

embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

# initialize sequence tagger
from flair.models import SequenceTagger

tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                        embeddings=embeddings,
                                        tag_dictionary=tag_dictionary,
                                        tag_type=tag_type,
                                        use_crf=True)

# initialize trainer
from flair.trainers.sequence_tagger_trainer import SequenceTaggerTrainer

trainer: SequenceTaggerTrainer = SequenceTaggerTrainer(tagger, corpus, test_mode=True)

trainer.train('resources/taggers/example-ner',
              learning_rate=0.1,
              mini_batch_size=32,
              max_epochs=20)
    # comment in this line to use character embeddings
    # CharacterEmbeddings(),

    # comment in these lines to use flair embeddings
    # FlairEmbeddings('news-forward'),
    # FlairEmbeddings('news-backward'),
]

embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

# 5. initialize sequence tagger
from flair.models import SequenceTagger

tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                        embeddings=embeddings,
                                        tag_dictionary=tag_dictionary,
                                        tag_type=tag_type,
                                        use_crf=True)

# 6. initialize trainer
from flair.trainers import ModelTrainer

# trainer: ModelTrainer = ModelTrainer(tagger, corpus)
checkpoint = 'resources/taggers/ner_with_random_dp_1/checkpoint.pt'
trainer = ModelTrainer.load_checkpoint(checkpoint, corpus)

# 7. start training
trainer.train('resources/taggers/ner_with_random_dp_1',
              learning_rate=0.1,
              mini_batch_size=32,
                    help='Use gpu/cpu, put "cuda" if gpu and "cpu" if cpu')
args = parser.parse_args()

input_folder = args.input
model_file = args.model
gpu_type = args.gpu

flair.device = torch.device(gpu_type)

from flair.data import Sentence, Corpus
from flair.datasets import ColumnCorpus
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer
from tqdm import tqdm
import torch

# Change this line if you have POS tags in your data, e.g. {0: 'text', 1: 'pos', 2: 'ner'}
columns = {0: 'text', 1: 'ner'}

corpus: ColumnCorpus = ColumnCorpus(input_folder, column_format={0: 'text', 1: 'ner'})

tagger = SequenceTagger.load(model_file)

print("Dev set results")
result, _ = tagger.evaluate(corpus.dev)
print(result.detailed_results)

print("Test set results")
result, _ = tagger.evaluate(corpus.test)
print(result.detailed_results)
from tensorflow.python.keras.backend import set_session
from tensorflow.python.keras.models import load_model

# sess = tf.Session()
# graph = tf.get_default_graph()

# IMPORTANT: models have to be loaded AFTER SETTING THE SESSION for keras!
# Otherwise, their weights will be unavailable in the threads after the session has been set there.
# set_session(sess)

print(datetime.datetime.now(), 'loading Bi-LSTM.h5')
cat_model = load_model('Bi-LSTM.h5')
print(datetime.datetime.now(), 'Category model loaded')

ner_model = SequenceTagger.load('checkpoint.pt')
print(datetime.datetime.now(), 'NER model loaded')

with open('./token.pkl', 'rb') as infile:
    token = pickle.load(infile)

with open('./cols.pkl', 'rb') as infile:
    cols = pickle.load(infile)

app = Flask(__name__)


def predict_category(sentence):
    seq_x = sequence.pad_sequences(token.texts_to_sequences([sentence]), maxlen=60)
from flair.data import Sentence
from flair.models import SequenceTagger

tagger: SequenceTagger = SequenceTagger.load("ner")

sentence: Sentence = Sentence("George Washington went to Washington .")

tagger.predict(sentence)

print("Analysing the sentence: %s" % sentence)
print("\nThe following NER tags are found: \n")
print(sentence.to_tagged_string())
from flair.data import Sentence
from flair.models import SequenceTagger

# load the NER tagger
tagger = SequenceTagger.load('ner')


def parseXML():
    import glob
    d = "/home/xtof/arxiv/"
    fs = glob.glob(d + "*.xml")
    for f in fs:
        print("FICH ", f)
        with open(f, "r") as ff:
            indesc = False
            abs = ""
            for l in ff:
                if '<dc:description>' in l:
                    i = l.find('<dc:description>')
                    l = l[i + 16:]
                    indesc = True
                if indesc:
                    abs += l.strip() + " "
                if '</dc:description>' in l:
                    i = abs.find('</dc:description>')
                    abs = abs[:i].strip()
                    indesc = False
                    # print(abs)
                    sentence = Sentence(abs)
                    tagger.predict(sentence)
                    for entity in sentence.get_spans('ner'):
                        print("NER", entity)
import dash_bootstrap_components as dbc
import dash_core_components as dcc
import dash_html_components as html
from flair.models import SequenceTagger

from components.data_ETL import load_text, create_upload_tab_html_output

# Env variables
PSEUDO_REST_API_URL = os.environ.get('PSEUDO_REST_API_URL', '')
PSEUDO_MODEL_PATH = os.environ.get('PSEUDO_MODEL_PATH', '')
TAGGER = None

if not PSEUDO_REST_API_URL and not PSEUDO_MODEL_PATH:
    print("Neither the pseudonymization service nor a trained model are available. We cannot continue :(")
    exit(1)
elif (not PSEUDO_REST_API_URL and PSEUDO_MODEL_PATH) or (PSEUDO_MODEL_PATH and PSEUDO_REST_API_URL):
    TAGGER = SequenceTagger.load(PSEUDO_MODEL_PATH)

with open("./assets/text_files/upload_example.txt", "r") as example:
    TEXTE_EXEMPLE = example.read()

tab_upload_content = dbc.Tab(
    label='Pseudonymisez un document',
    tab_id="tab-upload",
    children=html.Div(className='control-tab', children=[
        html.Div([html.P("Veuillez choisir un fichier à analyser (type .odt, .doc, .docx, .txt. Max 100 Ko)"),
                  html.P([html.B("Attention: "),
                          "cette application n'est qu'une démo, aucune donnée n'est conservée. "
                          "Veillez à ne pas transmettre d’informations sensibles."])],
                 className='app-controls-block'),
        html.Div(
            id='seq-view-fast-upload',
            children=dcc.Upload(id='upload-data',
from flair.data import Sentence
from flair.models import SequenceTagger

texts = ['Hello, World', 'Lorem ipsum dolor sit amet']

tagger = SequenceTagger.load('./temp/best-model.pt')

for text in texts:
    # predict NER tags
    sentence = Sentence(text)
    tagger.predict(sentence)

    print(f'****** {text}')
    spans = sentence.get_spans('ner')
    if not spans:
        print('No entities found')
    for entity in spans:
        print({
            "start": entity.start_pos,
            "end": entity.end_pos,
            "label": entity.tag
        })
    print('****\n')
from flair.data import Sentence
from flair.models import SequenceTagger

model = SequenceTagger.load('output/best-model.pt')

fr = open('../data/example_recipe.txt', 'r')
for line in fr:
    sentence = Sentence(line)
    model.predict(sentence)
    print(sentence.to_tagged_string())
class TARSTagger(FewshotClassifier):
    """
    TARS model for sequence tagging. In the backend, the model uses a BERT based 5-class
    sequence labeler which given a <label, text> pair predicts the probability for each word
    to belong to one of the BIOES classes. The input data is a usual Sentence object which is
    inflated by the model internally before pushing it through the transformer stack of BERT.
    """

    static_label_type = "tars_label"

    def __init__(
            self,
            task_name: str,
            tag_dictionary: Dictionary,
            tag_type: str,
            embeddings: str = 'bert-base-uncased',
            num_negative_labels_to_sample: int = 2,
            prefix: bool = True,
            **tagger_args,
    ):
        """
        Initializes a TARSTagger
        :param task_name: a string depicting the name of the task
        :param tag_dictionary: dictionary of labels you want to predict
        :param embeddings: name of the pre-trained transformer model e.g., 'bert-base-uncased' etc
        :param num_negative_labels_to_sample: number of negative labels to sample for each positive label
        against a sentence during training. Defaults to 2 negative labels for each positive label.
        The model samples all the negative labels if None is passed. That slows down the training considerably.
        """
        super(TARSTagger, self).__init__()

        from flair.embeddings import TransformerWordEmbeddings

        if not isinstance(embeddings, TransformerWordEmbeddings):
            embeddings = TransformerWordEmbeddings(model=embeddings,
                                                   fine_tune=True,
                                                   layers='-1',
                                                   layer_mean=False,
                                                   )

        # prepare TARS dictionary
        tars_dictionary = Dictionary(add_unk=False)
        tars_dictionary.add_item('O')
        tars_dictionary.add_item('S-')
        tars_dictionary.add_item('B-')
        tars_dictionary.add_item('E-')
        tars_dictionary.add_item('I-')

        # initialize a bare-bones sequence tagger
        self.tars_model = SequenceTagger(123,
                                         embeddings,
                                         tag_dictionary=tars_dictionary,
                                         tag_type=self.static_label_type,
                                         use_crf=False,
                                         use_rnn=False,
                                         reproject_embeddings=False,
                                         **tagger_args,
                                         )

        # transformer separator
        self.separator = str(self.tars_embeddings.tokenizer.sep_token)
        if self.tars_embeddings.tokenizer._bos_token:
            self.separator += str(self.tars_embeddings.tokenizer.bos_token)

        self.prefix = prefix
        self.num_negative_labels_to_sample = num_negative_labels_to_sample

        # Store task specific labels since TARS can handle multiple tasks
        self.add_and_switch_to_new_task(task_name, tag_dictionary, tag_type)

    def _get_tars_formatted_sentence(self, label, sentence):
        original_text = sentence.to_tokenized_string()

        label_text_pair = f"{label} {self.separator} {original_text}" if self.prefix \
            else f"{original_text} {self.separator} {label}"

        label_length = 0 if not self.prefix else len(label.split(" ")) + len(self.separator.split(" "))

        # make a tars sentence where all labels are O by default
        tars_sentence = Sentence(label_text_pair, use_tokenizer=False)
        for token in tars_sentence:
            token.add_tag(self.static_label_type, "O")

        # overwrite O labels with tags
        for token in sentence:
            tag = token.get_tag(self.get_current_label_type()).value

            if tag == "O":
                tars_tag = "O"
            elif tag == label:
                tars_tag = "S-"
            elif tag[1] == "-" and tag[2:] == label:
                tars_tag = tag.split('-')[0] + '-'
            else:
                tars_tag = "O"

            tars_sentence.get_token(token.idx + label_length).add_tag(self.static_label_type, tars_tag)

        return tars_sentence

    def _get_state_dict(self):
        model_state = {
            "state_dict": self.state_dict(),
            "current_task": self._current_task,
            "tag_type": self.get_current_label_type(),
            "tag_dictionary": self.get_current_label_dictionary(),
            "tars_model": self.tars_model,
            "num_negative_labels_to_sample": self.num_negative_labels_to_sample,
            "prefix": self.prefix,
"task_specific_attributes": self._task_specific_attributes, } return model_state @staticmethod def _init_model_with_state_dict(state): print("init TARS") # init new TARS classifier model = TARSTagger( task_name=state["current_task"], tag_dictionary=state["tag_dictionary"], tag_type=state["tag_type"], embeddings=state["tars_model"].embeddings, num_negative_labels_to_sample=state["num_negative_labels_to_sample"], prefix=state["prefix"], ) # set all task information model.task_specific_attributes = state["task_specific_attributes"] # linear layers of internal classifier model.load_state_dict(state["state_dict"]) return model @property def tars_embeddings(self): return self.tars_model.embeddings def predict( self, sentences: Union[List[Sentence], Sentence], mini_batch_size=32, verbose: bool = False, label_name: Optional[str] = None, return_loss=False, embedding_storage_mode="none", ): # return """ Predict sequence tags for Named Entity Recognition task :param sentences: a Sentence or a List of Sentence :param mini_batch_size: size of the minibatch, usually bigger is more rapid but consume more memory, up to a point when it has no more effect. :param all_tag_prob: True to compute the score for each tag on each token, otherwise only the score of the best tag is returned :param verbose: set to True to display a progress bar :param return_loss: set to True to return loss :param label_name: set this to change the name of the label type that is predicted :param embedding_storage_mode: default is 'none' which is always best. Only set to 'cpu' or 'gpu' if you wish to not only predict, but also keep the generated embeddings in CPU or GPU memory respectively. 'gpu' to store embeddings in GPU memory. """ if label_name == None: label_name = self.get_current_label_type() # with torch.no_grad(): if not sentences: return sentences if isinstance(sentences, Sentence): sentences = [sentences] # set context if not set already previous_sentence = None for sentence in sentences: if sentence.is_context_set(): continue sentence._previous_sentence = previous_sentence sentence._next_sentence = None if previous_sentence: previous_sentence._next_sentence = sentence previous_sentence = sentence # reverse sort all sequences by their length rev_order_len_index = sorted( range(len(sentences)), key=lambda k: len(sentences[k]), reverse=True ) reordered_sentences: List[Union[Sentence, str]] = [ sentences[index] for index in rev_order_len_index ] dataloader = DataLoader( dataset=SentenceDataset(reordered_sentences), batch_size=mini_batch_size ) # progress bar for verbosity if verbose: dataloader = tqdm(dataloader) overall_loss = 0 overall_count = 0 batch_no = 0 with torch.no_grad(): for batch in dataloader: batch_no += 1 if verbose: dataloader.set_description(f"Inferencing on batch {batch_no}") batch = self._filter_empty_sentences(batch) # stop if all sentences are empty if not batch: continue # go through each sentence in the batch for sentence in batch: # always remove tags first for token in sentence: token.remove_labels(label_name) all_labels = [label.decode("utf-8") for label in self.get_current_label_dictionary().idx2item] all_detected = {} for label in all_labels: tars_sentence = self._get_tars_formatted_sentence(label, sentence) label_length = 0 if not self.prefix else len(label.split(" ")) + len(self.separator.split(" ")) loss_and_count = self.tars_model.predict(tars_sentence, label_name=label_name, all_tag_prob=True, return_loss=True) overall_loss += loss_and_count[0].item() overall_count += loss_and_count[1] for span 
in tars_sentence.get_spans(label_name): span.set_label('tars_temp_label', label) all_detected[span] = span.score for span in tars_sentence.get_spans(label_name): for token in span: corresponding_token = sentence.get_token(token.idx - label_length) if corresponding_token is None: continue if corresponding_token.get_tag(label_name).value != '' and \ corresponding_token.get_tag(label_name).score > token.get_tag(label_name).score: continue corresponding_token.add_tag( label_name, token.get_tag(label_name).value + label, token.get_tag(label_name).score, ) # import operator # sorted_x = sorted(all_detected.items(), key=operator.itemgetter(1)) # sorted_x.reverse() # print(sorted_x) # for tuple in sorted_x: # span = tuple[0] # # tag_this = True # # for token in span: # corresponding_token = sentence.get_token(token.idx) # if corresponding_token is None: # tag_this = False # continue # if corresponding_token.get_tag(label_name).value != '' and \ # corresponding_token.get_tag(label_name).score > token.get_tag(label_name).score: # tag_this = False # continue # # if tag_this: # for token in span: # corresponding_token = sentence.get_token(token.idx) # corresponding_token.add_tag( # label_name, # token.get_tag(label_name).value + span.get_labels('tars_temp_label')[0].value, # token.get_tag(label_name).score, # ) # clearing token embeddings to save memory store_embeddings(batch, storage_mode=embedding_storage_mode) if return_loss: return overall_loss, overall_count
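# --- Usage sketch for the TARSTagger class above (not part of the original snippet) ---
# A minimal, hedged example of constructing and calling the class as defined above.
# 'tag_dictionary' is assumed to be a flair Dictionary built from a corpus, e.g.
# corpus.make_tag_dictionary(tag_type='ner'); without further training or loading a
# pre-trained TARS model, the predictions of this freshly initialized instance are
# of course not meaningful.
#
# from flair.data import Sentence
#
# tagger = TARSTagger(task_name='ner_demo',
#                     tag_dictionary=tag_dictionary,   # assumed to exist
#                     tag_type='ner',
#                     embeddings='bert-base-uncased')
#
# sentence = Sentence("George Washington went to Washington .")
# tagger.predict(sentence)
# print(sentence.to_tagged_string())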
        list_end.append(end)
    for i in range(len(list_mention)):
        dic["ner"]["extracted"].append(list_mention[i])
        dic["ner"]["start_offset"].append(list_start[i])
        dic["ner"]["end_offset"].append(list_end[i])
    return dic


def write_json(filename, dic):
    json_file = open(filename, 'w')
    json.dump(dic, json_file, ensure_ascii=False)
    return


if __name__ == '__main__':
    test_path = 'input.txt'
    result_path = 'predict.json'

    model = SequenceTagger.load('output/final-model.pt')

    input_sentences = []
    sentences = []
    tags = []

    file = make_wakati(test_path)
    for i in range(len(file)):
        line = file[i]
        input_sentence = Sentence(line)
        model.predict(input_sentence)
        input_sentences.append(input_sentence)

    for line in input_sentences:
        sentence, tag = convert_flair(line.to_tagged_string())
        sentences.append(sentence)
        tags.append(tag)

    predict_dictionary = convert_json(sentences, tags)
from flair.models import SequenceTagger
from flair.data import Sentence
import flair, torch
import sys

flair.device = torch.device('cpu')

classifier_model = sys.argv[1]
given_sentence = sys.argv[2]

classifier = SequenceTagger.load_from_file('./' + classifier_model + '/best-model.pt')

sentence = Sentence(given_sentence)
classifier.predict(sentence)

print(sentence)
# print(sentence.labels)
print(sentence.to_tagged_string())
def entities_extractor(url):
    res = requests.get(url)
    html_page = res.content
    soup = BeautifulSoup(html_page, 'html.parser')
    text = soup.find_all(text=True)
    set([t.parent.name for t in text])
    output = ''
    blacklist = [
        '[document]', 'a', 'article', 'aside', 'body', 'button', 'clippath', 'defs', 'div',
        'figcaption', 'figure', 'footer', 'form', 'g', 'h1', 'h2', 'head', 'header', 'html',
        'label', 'li', 'link', 'meta', 'nav', 'noscript', 'picture', 'script', 'section',
        'span', 'strong', 'style', 'svg', 'time', 'title', 'ul',
        # there may be more elements you don't want, such as "style", etc.
    ]
    for t in text:
        if t.parent.name not in blacklist:
            output += '{} '.format(t)

    sentences = [Sentence(sent, use_tokenizer=True) for sent in split_single(output)]
    tagger = SequenceTagger.load('ner')
    tagger.predict(sentences)

    li = []
    for i in sentences:
        for entity in i.get_spans('ner'):
            li.append(entity.to_dict())

    df = pd.DataFrame(li)
    df = pd.crosstab(df.text, df.type)
    return df
base_url = Path("")
wiki_subfolder = "wiki_2019"

# 1. Input sentences when using Flair.
input_documents = example_preprocessing()

# 2. Mention detection: there are two options. We used the NER tagger, but users can also plug in
#    their own mention detection module.
mention_detection = MentionDetection(base_url, wiki_subfolder)

# If you want to use your own MD system, the required input is: {doc_name: [text, spans] ... }.
mentions_dataset, n_mentions = mention_detection.format_spans(input_documents)

# Alternatively use the Flair NER tagger.
tagger_ner = SequenceTagger.load("ner-fast")
mentions_dataset, n_mentions = mention_detection.find_mentions(input_documents, tagger_ner)

# 3. Load model.
config = {
    "mode": "eval",
    "model_path": base_url / wiki_subfolder / "generated" / "model",
}
model = EntityDisambiguation(base_url, wiki_subfolder, config)

# 4. Entity disambiguation.
predictions, timing = model.predict(mentions_dataset)

# 5. Optionally use our function to get results in a usable format.
result = process_results(mentions_dataset,
def load_ner_model(self) -> SequenceTagger:
    model = SequenceTagger.load("flair/ner-english-ontonotes-fast")
    return model
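# --- Usage sketch (not part of the original snippet) ---
# A brief, hedged example of applying the OntoNotes tagger loaded above to a sentence.
# The class that owns load_ner_model() is not shown, so the tagger is loaded directly here;
# the example sentence is illustrative.
#
# from flair.data import Sentence
# from flair.models import SequenceTagger
#
# tagger = SequenceTagger.load("flair/ner-english-ontonotes-fast")
# sentence = Sentence("Jane moved to Berlin in March 2020 .")
# tagger.predict(sentence)
# for entity in sentence.get_spans("ner"):
#     print(entity)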
        labels.append(token.get_tag("ner").value)
    return tokens, labels


def iobes2bio(iobes_labels):
    bio_labels = []
    for label in iobes_labels:
        if label[0] == 'S':
            bio_labels.append('B' + label[1:])
        elif label[0] == 'E':
            bio_labels.append('I' + label[1:])
        else:
            bio_labels.append(label)
    return bio_labels


tagger = SequenceTagger.load(os.path.join(model_folder, 'final-model.pt'))

test_sentences = [x for x in corpus.test]
tagger.predict(test_sentences)

sentences = []
for sentence in test_sentences:
    tokens, labels = get_tokens_and_labels(sentence)
    labels = iobes2bio(labels)
    sentences.append((tokens, labels))

with open(os.path.join(data_folder, 'predict.bio'), 'w') as f:
    for tokens, labels in sentences:
        for token, label in zip(tokens, labels):
            f.write(f'{token}\t{label}\n')
        f.write('\n')
# -*- coding: utf-8 -*-
from flair.datasets import CONLL_03
from flair.embeddings import PooledFlairEmbeddings, StackedEmbeddings, WordEmbeddings
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer

corpus = CONLL_03(base_path="data/conll-2003")

embedding_types = [
    WordEmbeddings("glove"),
    PooledFlairEmbeddings("news-forward", pooling="min"),
    PooledFlairEmbeddings("news-backward", pooling="min"),
]

embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

tagger: SequenceTagger = SequenceTagger(
    hidden_size=256,
    embeddings=embeddings,
    tag_dictionary=corpus.make_tag_dictionary(tag_type="ner"),
    tag_type="ner",
)

trainer: ModelTrainer = ModelTrainer(tagger, corpus)
trainer.train("models/checkpoints", train_with_dev=True, max_epochs=150)
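# --- Usage sketch (not part of the original snippet) ---
# After training, flair's trainer typically writes a final-model.pt into the base path
# ("models/checkpoints" above); a brief, hedged sketch of loading it for inference.
#
# from flair.data import Sentence
# from flair.models import SequenceTagger
#
# tagger = SequenceTagger.load("models/checkpoints/final-model.pt")
# sentence = Sentence("The European Commission met in Brussels .")
# tagger.predict(sentence)
# print(sentence.to_tagged_string())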
def __init__(self, word_list, model='ner-ontonotes'):
    super(Vocabulary, self).__init__(word_list)
    self.compiled = None
    self.ner_tagger = SequenceTagger.load(model) if model else None
def __init__(self, **kwargs):
    super().__init__(**kwargs)
    self._flair_pos_tagger = SequenceTagger.load("pos-fast")
    self._flair_to_lemminflect_pos_map = {"NN": "NOUN", "VB": "VERB", "JJ": "ADJ"}
def load_model(self, path=None):
    if path is None:
        path = model_dir + "heb.sent"
    if not os.path.exists(path):
        raise FileNotFoundError("Cannot find sentence splitter model heb.sent at " + path)
    self.model = SequenceTagger.load(path)
        ttl = []
        ttl.append(row[3])
        ttl.append(row[4])
        tl.append(ttl)
    file.close()
    return tl


if __name__ == "__main__":
    ex = "The company also showcased its latest Dynasty series of vehicles, which were recently unveiled at the company’s spring product launch in Beijing"
    ex = "There are a lot of cars in Los Angeles"
    ex = 'BYD quickly debuted it\'s E-SEED GT concept car and Song Pro SUV alongside it\'s all-new e-series models at the Shanghai International Automobile Industry Exhibition'
    ex = "BYD debuted its E-SEED GT concept car and Song Pro SUV alongside its all-new e-series models at the Shanghai International Automobile Industry Exhibition. The company also showcased its latest Dynasty series of vehicles, which were recently unveiled at the company’s spring product launch in Beijing. A total of 23 new car models were exhibited at the event, held at Shanghai’s National Convention and Exhibition Center, fully demonstrating the BYD New Architecture (BNA) design, the 3rd generation of Dual Mode technology, plus the e-platform framework."
    ex = "The Akash eagerly wanted Mehar Sharma's blue coloured jacket, green umbrella of John Sowa, and Ritwik Mishra's big black red jeans"
    ex = "Akash wants umbrella of Mehar"

    tagger = SequenceTagger.load('chunk')
    print(ex)
    # sentence = Sentence('BYD quickly debuted it\'s E-SEED GT concept car and Song Pro SUV alongside it\'s all-new e-series models at the Shanghai International Automobile Industry Exhibition .')
    for x in getPhrases(ex, tagger):
        print(x)
    # print(type(strchunked))
    input('Enter')

    nlp = en_core_web_sm.load()
    doc = nlp(
        'The company also showcased its latest Dynasty series of vehicles, which were recently unveiled at the company’s spring product launch in Beijing'
    )
    pos_tags = [(i, i.tag_) for i in doc]
    print(pos_tags)
                    i] and pseudo_pred_labels[i] != 'O':
                FP += 1
            else:
                pass
    return TP, FP, FN


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Train flair")
    parser.add_argument("--folder", type=str, help="folder to chkp")
    parser.add_argument("--method", choices=['chunk', 'word'])
    args = parser.parse_args()
    args = vars(args)

    tagger = SequenceTagger.load('./flair_models/' + args['folder'] + '/final-model.pt')

    test_data_f = "score/eng.testb.src"
    test_labels_f = "score/eng.testb.trg"

    TP, FP, FN = 0, 0, 0
    with open(test_data_f, 'r') as f_data, \
            open(test_labels_f, 'r') as f_labels:
        for sent, labels in zip(f_data, f_labels):
            sent = sent[:-1].split(' ')
            labels = labels[:-1].split(' ')
            true_spans = iob_to_chunk(labels)
            sentence = Sentence(' '.join(sent))
            tagger.predict(sentence)
            pred_spans = sentence.get_spans('ner')
            pred_spans = [(r.tag, r.tokens[0].idx - 1, r.tokens[-1].idx - 1) for r in pred_spans]
            if args['method'] == 'chunk':
def run_experiment(config):
    print('Active learning strategy:', config.al.strat_name)
    print('Loading task...', config.data.task)

    preprocess = (config.model.model_type == 'crf')
    print(config.data.data_folder)
    X_train, X_test, y_train, y_test, tag_dictionary = load_task(
        config.data.data_folder, config.data.task, config.data.tag_column, preprocess)
    print('Done.')

    strat = strategies_to_try(config.al.strat_name)
    model_name = config.model.model_type

    for repeat in range(config.n_repeats):
        print(f'######################==Repeat {repeat} ==#####################')
        strat = strategies_to_try(config.al.strat_name)
        model_name = config.model.model_type

        if config.al.percent:
            percent = 0.02
            print('FULL:', len(y_train))
            y_seed = y_train2y_seed_percent(y_train, percent, rpt=repeat)
            selector = [False for _ in range(len(y_seed))]
            for ind, answ in enumerate(y_seed):
                if answ is None:
                    selector[ind] = False
                elif all(e is None for e in y_seed):
                    selector[ind] = False
                else:
                    selector[ind] = True
            y_nonempty = np.array(y_seed)[selector]
            print('2PERCENT:', len(y_nonempty))
            max_samples_number = int(len(y_seed) * percent)
        else:
            y_seed = y_train2y_seed(y_train, rpt=repeat)
            max_samples_number = config.al.max_samples_number

        if 'flair' in config.model.model_type:
            print(config.model.model_type)
            bayes_type = config.model.bayes_type if config.model.bayes else 'no_bayes'
            models_path = os.path.join(
                config.exp_path,
                f'{model_name}_{config.model.emb_name}_{bayes_type}/{config.al.strat_name}'
            )
            os.makedirs(models_path, exist_ok=True)

            if os.path.exists(os.path.join(models_path, f'statistics{repeat}.json')):
                print(f'statistics{repeat}.json already exists. Next')
                continue

            print('Embeddings', config.model.emb_name)
            emb = get_embeddings(config.model.emb_name)
            tagger = SequenceTagger(hidden_size=config.model.hidden_size,
                                    embeddings=emb(),
                                    tag_dictionary=tag_dictionary,
                                    tag_type=config.data.task,
                                    use_crf=True)
            print(config.model.bayes)
            if config.model.bayes:
                print('BAYES CHOSEN')
                convert_to_mc_dropout(
                    tagger,
                    (nn.Dropout, flair.nn.WordDropout, flair.nn.LockedDropout),
                    option='flair')
                active_tagger = LibActFlairBayes(
                    tagger,
                    base_path=models_path,
                    reset_model_before_train=True,
                    mini_batch_size=config.model.bs,
                    eval_mini_batch_size=config.model.ebs,
                    checkpoint=False,
                    learning_rate=config.model.lr,
                    index_subset=False,
                    save_all_models=False,
                    max_epochs=config.model.n_epochs,
                    min_learning_rate=config.model.min_lr)
                print(active_tagger)
            else:
                active_tagger = LibActFlair(
                    tagger,
                    base_path=models_path,
                    reset_model_before_train=True,
                    mini_batch_size=config.model.bs,
                    eval_mini_batch_size=config.model.ebs,
                    checkpoint=False,
                    learning_rate=config.model.lr,
                    index_subset=False,
                    save_all_models=False,
                    max_epochs=config.model.n_epochs,
                    min_learning_rate=config.model.min_lr)
            fit_model = False

        elif config.model.model_type == 'crf':
            models_path = os.path.join(config.exp_path, model_name)
            os.makedirs(models_path, exist_ok=True)
            if os.path.exists(os.path.join(models_path, f'statistics{repeat}.json')):
                print(f'statistics{repeat}.json already exists. Next')
                continue
            active_tagger = LibActCrf(algorithm="lbfgs",
                                      c1=0.1,
                                      c2=0.1,
                                      max_iterations=100,
                                      all_possible_transitions=True)
            fit_model = True

        elif config.model.model_type == 'transformers':
            if config.model.bayes:
                libactnn = LibActNNBayes
                bayes_type = config.model.bayes_type
            else:
                libactnn = LibActNN
                bayes_type = 'no_bayes'
            models_path = os.path.join(
                config.exp_path, f'{model_name}_{bayes_type}/{config.al.strat_name}')
            print(models_path)
            if os.path.exists(os.path.join(models_path, f'statistics{repeat}.json')):
                print(f'statistics{repeat}.json already exists. Next')
                continue
            index2tag = ['[PAD]'] + tag_dictionary.get_items()
            tag2index = {e: i for i, e in enumerate(index2tag)}
            active_tagger = create_libact_adaptor(tag2index, index2tag, LibActNN, config=config)
            fit_model = False

        active_learn_alg_ctor = make_libact_strategy_ctor(
            lambda tr_ds: strat(tr_ds, active_tagger),
            max_samples_number=config.al.max_samples_number)

        active_learner = ActiveLearner(
            active_learn_alg_ctor=active_learn_alg_ctor,
            y_dtype='str',
            X_full_dataset=X_train,
            y_full_dataset=y_seed,
            X_test_dataset=X_test,
            y_test_dataset=y_test,
            model_evaluate=active_tagger,
            eval_metrics=[f1_score],
            rnd_start_steps=0)

        statistics = emulate_active_learning(
            y_train, active_learner,
            max_iterations=config.al.n_iterations,
            fit_model=fit_model)

        dump_file(statistics, models_path, f'statistics{repeat}.json')
def delete_pattern_en(term_list):
    total = 0
    deletes = []
    lemmas_list = []
    cont = 0
    cont_inf = 0
    cont_post = 0

    # load the POS tagger once instead of reloading it for every term
    pos_tagger = SequenceTagger.load("flair/pos-english")

    # POS-prefix patterns of terms that should be removed from the term list.
    # The long chains of near-identical if-blocks in the original are collapsed into these
    # lookup sets (duplicated branches of the original appear here only once).
    single_patterns = {('adv-',)}
    pair_patterns = {
        ('aux-', 'verb'), ('verb', 'aux-'), ('verb', 'verb'), ('noun', 'verb'), ('noun', 'aux-'),
        ('adv-', 'adj-'), ('adj-', 'adv-'), ('adv-', 'aux-'), ('aux-', 'adv-'), ('adv-', 'verb'),
        ('noun', 'adv-'), ('adv-', 'noun'), ('verb', 'adv-'), ('verb', 'noun'), ('aux-', 'noun'),
        ('adj-', 'noun'),
    }
    triple_patterns = {
        ('noun', 'verb', 'verb'), ('noun', 'aux-', 'verb'), ('noun', 'aux-', 'aux-'),
        ('noun', 'verb', 'aux-'), ('noun', 'verb', 'noun'), ('noun', 'aux-', 'noun'),
        ('verb', 'noun', 'noun'), ('noun', 'noun', 'verb'), ('aux-', 'noun', 'noun'),
        ('noun', 'noun', 'aux-'), ('aux-', 'verb', 'noun'), ('noun', 'verb', 'adj-'),
        ('verb', 'noun', 'adj-'), ('noun', 'aux-', 'adj-'), ('noun', 'adv-', 'adj-'),
        ('adj-', 'adv-', 'adj-'), ('noun', 'adv-', 'scon'), ('adj-', 'scon', 'adv-'),
        ('aux-', 'noun', 'adj-'), ('verb', 'verb', 'verb'), ('adj-', 'noun', 'adj-'),
    }

    for i in term_list:
        if len(i) > 1:
            # POS-tag the term (if the lynx service is down, try https://corenlp.run/ instead)
            sentence = Sentence(i)
            pos_tagger.predict(sentence)
            tag = sentence.get_spans('pos')
            total = total + 1

            joini = i
            list_pos = []
            spl = joini.split(' ')
            if joini != '':
                for t in tag:
                    if t.tag == 'AUX':
                        doc = nlp(t.text)
                        lemlist = [tok.lemma_ for tok in doc]
                        lem = ''.join(lemlist)
                        lemmas_list.append(lem)
                        if lem == i:
                            lem = t.text
                        list_pos.append('aux--' + str(lem))
                        if len(spl) == 1:
                            ind = term_list.index(str(i))
                            term_list[ind] = str(lem)
                    if t.tag == 'NOUN':
                        list_pos.append('noun-' + str(t.text))
                    if t.tag == 'VERB':
                        cont_inf = cont_inf + 1
                        doc = nlp(t.text)
                        for tok in doc:
                            l = tok.lemma_
                            if l != t.text:
                                cont_post = cont_post + 1
                        lemlist = [tok.lemma_ for tok in doc]
                        lem = ''.join(lemlist)
                        lemmas_list.append(lem)
                        if lem == i:
                            lem = t.text
                        list_pos.append('verb-' + str(lem))
                        if len(spl) == 1:
                            ind = term_list.index(str(i))
                            term_list[ind] = str(lem)
                    if t.tag == 'ADV':
                        list_pos.append('adv--' + str(t.text))
                    if t.tag == 'ADJ':
                        list_pos.append('adj--' + str(t.text))
                    if t.tag == 'SCONJ':
                        list_pos.append('sconj' + str(t.text))

            spl_i = joini.split(' ')
            # compare the 4-character POS prefixes against the pattern sets defined above
            prefixes = tuple(p[0:4] for p in list_pos)

            if len(list_pos) == 1 and prefixes in single_patterns:
                deletes.append(joini)
                cont = cont + 1
            elif len(list_pos) == 2 and len(spl_i) == 2 and prefixes in pair_patterns:
                deletes.append(joini)
                cont = cont + 1
            elif len(list_pos) == 3 and len(spl_i) == 3 and prefixes in triple_patterns:
                deletes.append(joini)
                cont = cont + 1

    for i in deletes:
        if i in term_list:
            ind = term_list.index(i)
            term_list.pop(ind)

    # elapsed_time = time() - start_time
    # txt = 'PATRONES, DELETE' + ' (' + str(cont) + ') NEW LIST SIZE: (' + str(len(term_list)) + ') TIME: (' + str(elapsed_time) + ')'
    joind = ', '.join(deletes)
    # print('PATRONES DELETE', cont, len(term_list), elapsed_time)
    # conts_log.information(txt, 'TERMS REMOVED: ' + joind)
    return term_list
def train_sequence_labeling_model(data_folder, proposed_tags_vocabulary_size, skf_split_no):
    """
    Trains the sequence labeling model with 2 RNN layers instead of 1.
    The model is trained to predict the part of speech tag and takes into account information about:
    - text (plain text made of tokens that together form a sentence),
    - occurrence of a separator before the token,
    - proposed tags for the given token.
    It is trained with Stacked Embeddings used to combine different embeddings together. Words are embedded
    using a concatenation of two vector embeddings:
    - Flair Embeddings - contextual string embeddings that capture latent syntactic-semantic information that
      goes beyond standard word embeddings. Key differences are: (1) they are trained without any explicit notion
      of words and thus fundamentally model words as sequences of characters, and (2) they are contextualized by
      their surrounding text, meaning that the same word will have different embeddings depending on its contextual
      use. A forward model (that reads the input plain text from left to right) and a backward model (that reads it
      from right to left) are used for part of speech (pos) tag training.
    - One Hot Embeddings - embeddings that encode each word in a vocabulary as a one-hot vector, followed by an
      embedding layer. These embeddings thus do not encode any prior knowledge as most other embeddings do. They
      also differ in that they require to see a Corpus during instantiation, so they can build up a vocabulary
      consisting of the most common words seen in the corpus, plus an UNK token for all rare words.
      There are two One Hot Embeddings used in training:
      - the first to embed information about the occurrence of a separator before the token,
      - the second to embed information about the proposed tags concatenated with a ';'.
    Model training is based on the stratified 10 fold cross validation split indicated by the skf_split_no argument.
    Model and training logs are saved in the resources_ex_4/taggers/example-pos/it-<skf_split_no> directory (where
    <skf_split_no> is the number of the stratified 10 fold cross validation split used to train the model).

    This is the method where internal states of the forward and backward Flair models are taken at the end of each
    token and, supplemented by information about the occurrence of a separator before the token and the proposed
    tags for the given token, used to train the model for one of the stratified 10 fold cross validation splits.
    Additionally, the method writes further training log files and saves them in the resources_ex_4 directory of
    this project under the name training_ex_4_<skf_split_no>.log

    :param data_folder: folder where files with column corpus split are stored. Those columns are used to
    initialize the ColumnCorpus object
    :param proposed_tags_vocabulary_size: number of proposed tags
    :param skf_split_no: number that indicates one of the stratified 10 fold cross validation splits (from range
    1 to 10) used to train the model
    """
    # define columns
    columns = {0: 'text', 1: 'pos', 2: 'is_separator', 3: 'proposed_tags'}
    # init a corpus using column format, data folder and the names of the train and test files
    # 1. get the corpus
    corpus: Corpus = ColumnCorpus(data_folder, columns,
                                  train_file='train_' + str(skf_split_no),
                                  test_file='test_' + str(skf_split_no),
                                  dev_file=None)
    log.info(corpus)
    # 2. what tag do we want to predict
    tag_type = 'pos'
    # 3. make the tag dictionary from the corpus
    tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
    log.info(tag_dictionary)
    # 4. initialize embeddings
    embedding_types: List[TokenEmbeddings] = [
        FlairEmbeddings('pl-forward', chars_per_chunk=64),
        FlairEmbeddings('pl-backward', chars_per_chunk=64),
        OneHotEmbeddings(corpus=corpus, field='is_separator', embedding_length=3, min_freq=3),
        OneHotEmbeddings(corpus=corpus, field='proposed_tags',
                         embedding_length=math.ceil((proposed_tags_vocabulary_size + 1) ** 0.25),
                         min_freq=3)
    ]
    embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)
    # 5. initialize sequence tagger
    tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                            embeddings=embeddings,
                                            tag_dictionary=tag_dictionary,
                                            tag_type=tag_type,
                                            use_crf=False,
                                            rnn_layers=2)
    # 6. initialize trainer
    trainer: ModelTrainer = ModelTrainer(tagger, corpus)
    # 7. start training
    trainer.train(
        use_scratch_dir_if_available('resources_ex_4/taggers/example-pos/it-' + str(skf_split_no)),
        learning_rate=0.1,
        mini_batch_size=32,
        embeddings_storage_mode='gpu',
        max_epochs=sys.maxsize,
        monitor_test=True)
    # 8. plot weight traces (optional)
    plotter = Plotter()
    plotter.plot_weights(
        use_scratch_dir_if_available('resources_ex_4/taggers/example-pos/it-' + str(skf_split_no) + '/weights.txt'))
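# --- Usage sketch (not part of the original snippet) ---
# A brief, hedged sketch of loading one of the per-split models trained above for tagging.
# The path follows the directory layout described in the docstring, and flair's trainer is
# assumed to have written a final-model.pt there; the example sentence is illustrative Polish
# (matching the 'pl-*' embeddings).
#
# from flair.data import Sentence
# from flair.models import SequenceTagger
#
# split_no = 1
# tagger = SequenceTagger.load('resources_ex_4/taggers/example-pos/it-{}/final-model.pt'.format(split_no))
# sentence = Sentence('Ala ma kota .')
# tagger.predict(sentence)
# print(sentence.to_tagged_string())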
from flair.data import Sentence
from flair.models import SequenceTagger

# convert text file into String
with open(r'/home/pia/Uni/5.Semester/Textmining/Satz-Reduktion/satz-reduktion/Datensatz/Saetze_clean_flair.txt',
          "r") as myfile:
    data = myfile.read().replace('\n', ' ')

# make a sentence
sentence = Sentence(data, use_tokenizer=True)

# load the NER tagger
tagger = SequenceTagger.load('de-ner')

# run NER over sentence
tagger.predict(sentence)

# save tagged sentence into a String
tagged = sentence.to_tagged_string()

# save String into text file
file = open('/home/pia/Uni/5.Semester/Textmining/Satz-Reduktion/satz-reduktion/Datensatz/saetze_clean_getagged_flair.txt',
            'w')
file.writelines(tagged)

print(sentence)
print('The following NER tags are found:')

# iterate over entities and print spans
corpus: Corpus = ColumnCorpus(data_folder, columns,
                              train_file='train.txt',
                              test_file='test.txt',
                              dev_file='valid.txt')
print(corpus)

tag_type = 'ner'
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
print(tag_dictionary)

if ARGS.restore:
    ensemble_tagger = EnsembleTagger.load(model_path + "final-model.pt")
else:
    elmo_tagger = SequenceTagger(hidden_size=256,
                                 embeddings=ELMoEmbeddings('small'),
                                 tag_dictionary=tag_dictionary,
                                 tag_type=tag_type,
                                 use_crf=True)
    bert_tagger = SequenceTagger(hidden_size=256,
                                 embeddings=BertEmbeddings(),
                                 tag_dictionary=tag_dictionary,
                                 tag_type=tag_type,
                                 use_crf=True)
    xlnet_tagger = SequenceTagger(hidden_size=256,
                                  embeddings=XLNetEmbeddings(),
                                  tag_dictionary=tag_dictionary,
                                  tag_type=tag_type,
                                  use_crf=True)
    flair_tagger = SequenceTagger(hidden_size=256,
                                  embeddings=StackedEmbeddings([
                                      FlairEmbeddings('news-forward'),
from flair.data import Sentence
from flair.models import SequenceTagger

# make a sentence
sentence = Sentence('I love Berlin .')

# load the NER tagger
tagger = SequenceTagger.load('ner')

# run NER over sentence
tagger.predict(sentence)
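# --- Continuation sketch (not part of the original snippet) ---
# The snippet stops right after prediction; a short, hedged continuation showing how the
# tagged result is typically inspected (the calls mirror those used in the other snippets above).
#
# print(sentence.to_tagged_string())
# for entity in sentence.get_spans('ner'):
#     print(entity)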
for ppn in ppnDirs:
    for dirpath, dirnames, files in os.walk(sbbGetBasePath + ppn):
        for name in files:
            if dirpath.endswith("_FULLTEXT"):
                # if we found a fulltext directory, only add XML files, i.e., the ALTO candidate files
                if name.endswith(".xml") or name.endswith(".XML"):
                    fulltextFilePaths.append(os.path.join(dirpath, name))
                    dirsPerPPN[ppn].append(os.path.join(dirpath, name))

totalFiles = len(fulltextFilePaths)
printLog("Found %i ALTO candidate files for further processing." % totalFiles)

if useFlairNLP:
    nerModel = SequenceTagger.load(flairModel)
    print("Flair model loaded.")

processCounter = 0
for ppn in dirsPerPPN:
    textPerPPN = ""
    nerTextPerPPN = ""
    nerDicts = []
    print("Processing PPN: " + ppn)
    for file in dirsPerPPN[ppn]:
        processCounter += 1
        print("\tProcessing file %i of %i (total files over all PPNs)" % (processCounter, totalFiles))
        r = parseALTO(file)