def annotate(output_format, dataset_type='conll2003', lang='en', architecture='BidLSTM_CRF',
             use_ELMo=False, use_BERT=False, file_in=None, file_out=None):
    """Tag an input file with a pre-trained NER model.

    The dataset/language pair selects the base model name; ELMo/BERT flags and
    the architecture are appended as suffixes. Raises ValueError when file_in
    is missing or not a regular file.
    """
    if file_in is None or not os.path.isfile(file_in):
        raise ValueError("the provided input file is not valid")

    # Map the supported dataset/language combinations to their base model name.
    if dataset_type == 'conll2003' and lang == 'en':
        model_name = 'ner-en-conll2003'
    elif dataset_type == 'conll2012' and lang == 'en':
        model_name = 'ner-en-conll2012'
    elif lang == 'fr':
        model_name = 'ner-fr-lemonde'
    else:
        print("dataset/language combination is not supported:", dataset_type, lang)
        return

    # Contextual-embedding suffix: ELMo takes precedence over BERT.
    if use_ELMo:
        model_name += '-with_ELMo'
    elif use_BERT:
        model_name += '-with_BERT'
    model_name += '-' + architecture

    model = Sequence(model_name)
    model.load()

    start_time = time.time()
    model.tag_file(file_in=file_in, output_format=output_format, file_out=file_out)
    runtime = round(time.time() - start_time, 3)
    print("runtime: %s seconds " % (runtime))
def eval(dataset_type='conll2003', lang='en', architecture='BidLSTM_CRF',
         use_ELMo=False, use_BERT=False, data_path=None):
    """Evaluate a pre-trained NER model on the fixed test set of the dataset.

    Only English CoNLL-2003 and CoNLL-2012 are supported here; other
    combinations print a message and return.
    """
    if dataset_type == 'conll2003' and lang == 'en':
        print('Loading CoNLL-2003 NER data...')
        x_test, y_test = load_data_and_labels_conll(
            'data/sequenceLabelling/CoNLL-2003/eng.testb')
        model_name = 'ner-en-conll2003'
    elif dataset_type == 'conll2012' and lang == 'en':
        print('Loading Ontonotes 5.0 CoNLL-2012 NER data...')
        x_test, y_test = load_data_and_labels_conll(
            'data/sequenceLabelling/CoNLL-2012-NER/eng.test')
        model_name = 'ner-en-conll2012'
    else:
        print("dataset/language combination is not supported for fixed eval:", dataset_type, lang)
        return

    stats(x_eval=x_test, y_eval=y_test)

    # Contextual-embedding suffix: ELMo takes precedence over BERT.
    if use_ELMo:
        model_name += '-with_ELMo'
    elif use_BERT:
        model_name += '-with_BERT'
    model_name += '-' + architecture

    model = Sequence(model_name)
    model.load()

    start_time = time.time()
    print("\nEvaluation on test set:")
    model.eval(x_test, y_test)
    runtime = round(time.time() - start_time, 3)
    print("runtime: %s seconds " % (runtime))
def annotate(texts, output_format, architecture='BidLSTM_CRF', transformer=None, use_ELMo=False):
    """Run the insult-recognition model over a list of texts.

    Returns the model's annotations; for the 'json' output format the
    runtime (seconds) is added to the result, otherwise it is printed.
    """
    name = 'insult-' + architecture
    if use_ELMo:
        name += '-with_ELMo'

    # load the model
    model = Sequence(name, architecture=architecture, transformer_name=transformer,
                     use_ELMo=use_ELMo)
    model.load()

    t0 = time.time()
    annotations = model.tag(texts, output_format)
    elapsed = round(time.time() - t0, 3)

    if output_format == 'json':
        annotations["runtime"] = elapsed
    else:
        print("runtime: %s seconds " % (elapsed))
    return annotations
def eval_(model, use_ELMo=False, input_path=None):
    """Evaluate a pre-trained grobid model on a CRF-format evaluation file.

    model: grobid model name (e.g. 'date'); the loaded model is
    'grobid-<model>' with an optional '-with_ELMo' suffix.
    input_path: path to the evaluation data file (required).
    """
    print('Loading data...')
    if input_path is None:
        # it should never be the case
        print("A Grobid evaluation data file must be specified to evaluate a grobid model for the eval action")
        # fix: bail out here — without data, the evaluation below would
        # crash with a NameError on x_all/y_all/f_all
        return

    x_all, y_all, f_all = load_data_and_labels_crf_file(input_path)
    print(len(x_all), 'evaluation sequences')

    model_name = 'grobid-' + model
    if use_ELMo:
        model_name += '-with_ELMo'

    start_time = time.time()

    # load the model
    model = Sequence(model_name)
    model.load()

    # evaluation
    print("\nEvaluation:")
    model.eval(x_all, y_all, features=f_all)
    runtime = round(time.time() - start_time, 3)
    print("Evaluation runtime: %s seconds " % (runtime))
def annotate_text(texts, model, output_format, use_ELMo=False, architecture='BidLSTM_CRF', features=None):
    """Tag a list of texts with a pre-trained grobid model.

    Returns the annotations; for the 'json' output format the runtime
    (seconds) is inserted into the result, otherwise it is printed.
    """
    annotations = []

    # load model
    model_name = 'grobid-' + model
    model_name += '-' + architecture
    # BERT-flavoured models do not take the ELMo suffix
    # (fix: use "not in", not "not 'bert' in")
    if use_ELMo and 'bert' not in model.lower():
        model_name += '-with_ELMo'
    model = Sequence(model_name)
    model.load()

    start_time = time.time()
    annotations = model.tag(texts, output_format, features=features)
    runtime = round(time.time() - start_time, 3)

    # fix: 'is' tests object identity; string comparison needs '=='
    if output_format == 'json':
        annotations["runtime"] = runtime
    else:
        print("runtime: %s seconds " % (runtime))
    return annotations
def eval_(model, use_ELMo=False, input_path=None, architecture='BidLSTM_CRF'):
    """Evaluate a pre-trained grobid model (architecture-suffixed variant).

    model: grobid model name; the loaded model is
    'grobid-<model>-<architecture>' with an optional '-with_ELMo' suffix.
    input_path: path to the CRF-format evaluation data file (required).
    """
    print('Loading data...')
    if input_path is None:
        # it should never be the case
        print(
            "A Grobid evaluation data file must be specified for evaluating a grobid model for the eval action, use parameter --input "
        )
        # fix: bail out here — without data, the evaluation below would
        # crash with a NameError on x_all/y_all/f_all
        return

    x_all, y_all, f_all = load_data_and_labels_crf_file(input_path)
    print(len(x_all), 'evaluation sequences')

    model_name = 'grobid-' + model
    model_name += '-' + architecture
    # BERT-flavoured models do not take the ELMo suffix
    # (fix: use "not in", not "not 'bert' in")
    if use_ELMo and 'bert' not in model.lower():
        model_name += '-with_ELMo'

    start_time = time.time()

    # load the model
    model = Sequence(model_name)
    model.load()

    # evaluation
    print("\nEvaluation:")
    model.eval(x_all, y_all, features=f_all)
    runtime = round(time.time() - start_time, 3)
    print("Evaluation runtime: %s seconds " % (runtime))
def train_eval(model, embeddings_name=None, architecture='BidLSTM_CRF', transformer=None,
               input_path=None, output_path=None, fold_count=1, features_indices=None,
               max_sequence_length=-1, batch_size=-1, max_epoch=-1, use_ELMo=False):
    """Train a grobid sequence-labelling model and evaluate it on a held-out split.

    The data is split 90/10 into train+valid vs. eval, then the train+valid
    part is split again 90/10 into train vs. valid. Hyper-parameters passed
    as -1 are resolved by configure(). The model is saved after evaluation.
    """
    print('Loading data...')
    if input_path is None:
        # default training data shipped with the project for this grobid model
        x_all, y_all, f_all = load_data_and_labels_crf_file('data/sequenceLabelling/grobid/'+model+'/'+model+'-060518.train')
    else:
        x_all, y_all, f_all = load_data_and_labels_crf_file(input_path)

    # 10% held out for final evaluation (shuffled), then 10% of the rest for validation
    x_train_all, x_eval, y_train_all, y_eval, f_train_all, f_eval = train_test_split(x_all, y_all, f_all, test_size=0.1, shuffle=True)
    x_train, x_valid, y_train, y_valid, f_train, f_valid = train_test_split(x_train_all, y_train_all, f_train_all, test_size=0.1)

    print(len(x_train), 'train sequences')
    print(len(x_valid), 'validation sequences')
    print(len(x_eval), 'evaluation sequences')

    # resolve defaults (-1 values) and build the final model name
    batch_size, max_sequence_length, model_name, embeddings_name, max_epoch, multiprocessing, early_stop = configure(model,
                                                                                                                     architecture,
                                                                                                                     output_path,
                                                                                                                     max_sequence_length,
                                                                                                                     batch_size,
                                                                                                                     embeddings_name,
                                                                                                                     max_epoch,
                                                                                                                     use_ELMo)

    # note: rebinds the 'model' parameter from the model name to the model object
    model = Sequence(model_name, recurrent_dropout=0.50, embeddings_name=embeddings_name,
                     architecture=architecture, transformer_name=transformer,
                     max_sequence_length=max_sequence_length, batch_size=batch_size,
                     fold_number=fold_count, features_indices=features_indices,
                     max_epoch=max_epoch, use_ELMo=use_ELMo,
                     multiprocessing=multiprocessing, early_stop=early_stop)

    start_time = time.time()
    if fold_count == 1:
        model.train(x_train, y_train, f_train=f_train, x_valid=x_valid, y_valid=y_valid, f_valid=f_valid)
    else:
        model.train_nfold(x_train, y_train, f_train=f_train, x_valid=x_valid, y_valid=y_valid, f_valid=f_valid)
    runtime = round(time.time() - start_time, 3)
    print("training runtime: %s seconds " % runtime)

    # evaluation
    print("\nEvaluation:")
    model.eval(x_eval, y_eval, features=f_eval)

    # saving the model (must be called after eval for multiple fold training)
    if output_path:
        model.save(output_path)
    else:
        model.save()
def __init__(self):
    """Load the 'material' sequence-labelling model and a MaterialParser helper."""
    # imports kept local, presumably to defer loading the deep-learning stack
    # until this class is actually instantiated — TODO confirm
    from delft.sequenceLabelling import Sequence
    from delft.sequenceLabelling.models import BidLSTM_CRF
    self.model = Sequence("material", BidLSTM_CRF.name)
    self.model.load(dir_path="./models")
    # pubchem lookups disabled; verbose parsing output enabled
    self.mp = MaterialParser(pubchem_lookup=False, verbose=True)
    # NOTE(review): alternation has no word boundaries, so 'or'/'and' match
    # inside longer words too — confirm this is intended
    self.regex_separators = re.compile(r',|;|or|and')
def __init__(self, model_path=None):
    """Load the material NER model from model_path when it exists, else from ./models."""
    from delft.sequenceLabelling import Sequence
    from delft.sequenceLabelling.models import BidLSTM_CRF
    self.model = Sequence("materialNER_fastText_oS+Sm-BidLSTM_CRF", BidLSTM_CRF.name)
    # fall back to the default model directory when no usable path is given
    directory = model_path if (model_path and os.path.exists(model_path)) else "./models"
    self.model.load(dir_path=directory)
def train_eval(model, embeddings_name, architecture='BidLSTM_CRF', use_ELMo=False,
               input_path=None, output_path=None, fold_count=1, features_indices=None):
    """Train a grobid model and evaluate on a 10% held-out split (older API).

    Uses the legacy Sequence keyword 'model_type' and a fixed max_epoch of 100.
    The model is saved after evaluation.
    """
    print('Loading data...')
    if input_path is None:
        # default training data shipped with the project for this grobid model
        x_all, y_all, f_all = load_data_and_labels_crf_file('data/sequenceLabelling/grobid/'+model+'/'+model+'-060518.train')
    else:
        x_all, y_all, f_all = load_data_and_labels_crf_file(input_path)

    # 10% held out for final evaluation, then 10% of the rest for validation
    x_train_all, x_eval, y_train_all, y_eval, f_train_all, f_eval = train_test_split(x_all, y_all, f_all, test_size=0.1)
    x_train, x_valid, y_train, y_valid, f_train, f_valid = train_test_split(x_train_all, y_train_all, f_train_all, test_size=0.1)

    print(len(x_train), 'train sequences')
    print(len(x_valid), 'validation sequences')
    print(len(x_eval), 'evaluation sequences')

    batch_size, max_sequence_length, model_name = configure(model, architecture, output_path, use_ELMo)

    # note: rebinds the 'model' parameter from the model name to the model object
    model = Sequence(model_name, max_epoch=100, recurrent_dropout=0.50,
                     embeddings_name=embeddings_name, model_type=architecture,
                     use_ELMo=use_ELMo, max_sequence_length=max_sequence_length,
                     batch_size=batch_size, fold_number=fold_count,
                     features_indices=features_indices)

    start_time = time.time()
    if fold_count == 1:
        model.train(x_train, y_train, f_train=f_train, x_valid=x_valid, y_valid=y_valid, f_valid=f_valid)
    else:
        # NOTE(review): fold_number is passed both to the constructor and here — confirm the API expects this
        model.train_nfold(x_train, y_train, f_train=f_train, x_valid=x_valid, y_valid=y_valid, f_valid=f_valid, fold_number=fold_count)
    runtime = round(time.time() - start_time, 3)
    print("training runtime: %s seconds " % runtime)

    # evaluation
    print("\nEvaluation:")
    model.eval(x_eval, y_eval, features=f_eval)

    # saving the model
    if (output_path):
        model.save(output_path)
    else:
        model.save()
def train(model, embeddings_name=None, architecture=None, transformer=None, input_path=None,
          output_path=None, features_indices=None, max_sequence_length=-1, batch_size=-1,
          max_epoch=-1, use_ELMo=False):
    """Train a grobid sequence-labelling model (no held-out evaluation).

    Splits the data 90/10 into train vs. valid; hyper-parameters passed as -1
    are resolved by configure(). The trained model is saved at the end.
    """
    print('Loading data...')
    # fix: compare to None with 'is', not '=='
    if input_path is None:
        # default training data shipped with the project for this grobid model
        x_all, y_all, f_all = load_data_and_labels_crf_file('data/sequenceLabelling/grobid/'+model+'/'+model+'-060518.train')
    else:
        x_all, y_all, f_all = load_data_and_labels_crf_file(input_path)

    print(len(x_all), 'total sequences')

    x_train, x_valid, y_train, y_valid, f_train, f_valid = train_test_split(x_all, y_all, f_all, test_size=0.1, shuffle=True)
    print(len(x_train), 'train sequences')
    print(len(x_valid), 'validation sequences')

    # resolve defaults (-1 values) and build the final model name
    batch_size, max_sequence_length, model_name, embeddings_name, max_epoch, multiprocessing, early_stop = configure(model,
                                                                                                                     architecture,
                                                                                                                     output_path,
                                                                                                                     max_sequence_length,
                                                                                                                     batch_size,
                                                                                                                     embeddings_name,
                                                                                                                     max_epoch,
                                                                                                                     use_ELMo)

    # note: rebinds the 'model' parameter from the model name to the model object
    model = Sequence(model_name, recurrent_dropout=0.50, embeddings_name=embeddings_name,
                     architecture=architecture, transformer_name=transformer,
                     batch_size=batch_size, max_sequence_length=max_sequence_length,
                     features_indices=features_indices, max_epoch=max_epoch,
                     use_ELMo=use_ELMo, multiprocessing=multiprocessing,
                     early_stop=early_stop)

    start_time = time.time()
    model.train(x_train, y_train, f_train, x_valid, y_valid, f_valid)
    runtime = round(time.time() - start_time, 3)
    print("training runtime: %s seconds " % (runtime))

    # saving the model
    if output_path:
        model.save(output_path)
    else:
        model.save()
def annotate(texts, output_format, architecture='BidLSTM_CRF'):
    """Run the 'insult' model over a list of texts.

    Returns the annotations; for the 'json' output format the runtime
    (seconds) is inserted into the result, otherwise it is printed.
    """
    annotations = []

    # load model
    model = Sequence('insult', architecture=architecture)
    model.load()

    start_time = time.time()
    annotations = model.tag(texts, output_format)
    runtime = round(time.time() - start_time, 3)

    # fix: 'is' tests object identity; string comparison needs '=='
    if output_format == 'json':
        annotations["runtime"] = runtime
    else:
        print("runtime: %s seconds " % (runtime))
    return annotations
def train(model, embeddings_name, architecture='BidLSTM_CRF', use_ELMo=False,
          input_path=None, output_path=None):
    """Train a grobid model without layout features (older API, legacy
    'model_type' keyword, fixed max_epoch of 100)."""
    print('Loading data...')
    if input_path is None:
        x_all, y_all, f_all = load_data_and_labels_crf_file(
            'data/sequenceLabelling/grobid/' + model + '/' + model + '-060518.train')
    else:
        x_all, y_all, f_all = load_data_and_labels_crf_file(input_path)

    # NOTE(review): f_all (features) is loaded but not used in the split or
    # training below — presumably this legacy path trains without features; confirm
    x_train, x_valid, y_train, y_valid = train_test_split(x_all, y_all, test_size=0.1)

    print(len(x_train), 'train sequences')
    print(len(x_valid), 'validation sequences')

    if output_path:
        # a custom output path keeps the bare model name
        model_name = model
    else:
        model_name = 'grobid-' + model
        # NOTE(review): the ELMo suffix is applied only to the default name here —
        # confirm this matches the naming used at load time
        if use_ELMo:
            model_name += '-with_ELMo'

    # note: rebinds the 'model' parameter from the model name to the model object
    model = Sequence(model_name, max_epoch=100, recurrent_dropout=0.50,
                     embeddings_name=embeddings_name, model_type=architecture,
                     use_ELMo=use_ELMo)

    start_time = time.time()
    model.train(x_train, y_train, x_valid, y_valid)
    runtime = round(time.time() - start_time, 3)
    print("training runtime: %s seconds " % (runtime))

    # saving the model
    if (output_path):
        model.save(output_path)
    else:
        model.save()
def train(embeddings_name, architecture='BidLSTM_CRF'):
    """Train the 'insult' model on the toxic-comments XML corpus and save it."""
    data_root = os.path.join(os.path.dirname(__file__), 'data/sequenceLabelling/toxic/')

    print('Loading data...')
    x_train, y_train = load_data_and_labels_xml_file(os.path.join(data_root, 'corrected.xml'))
    x_valid, y_valid = load_data_and_labels_xml_file(os.path.join(data_root, 'valid.xml'))
    print(len(x_train), 'train sequences')
    print(len(x_valid), 'validation sequences')

    model = Sequence('insult', max_epoch=50, embeddings_name=embeddings_name)
    model.train(x_train, y_train, x_valid=x_valid, y_valid=y_valid)
    print('training done')

    # saving the model
    model.save()
def annotate_text(texts, model, output_format, use_ELMo=False):
    """Tag a list of texts with a pre-trained grobid model (older API).

    Returns the annotations; for the 'json' output format the runtime
    (seconds) is inserted into the result, otherwise it is printed.
    """
    annotations = []

    # load model
    model_name = 'grobid-'+model
    if use_ELMo:
        model_name += '-with_ELMo'
    model = Sequence(model_name)
    model.load()

    start_time = time.time()
    annotations = model.tag(texts, output_format)
    runtime = round(time.time() - start_time, 3)

    # fix: 'is' tests object identity; string comparison needs '=='
    if output_format == 'json':
        annotations["runtime"] = runtime
    else:
        print("runtime: %s seconds " % (runtime))
    return annotations
def train(embeddings_name=None, architecture='BidLSTM_CRF', transformer=None, use_ELMo=False):
    """Train the 'insult' model (newer API) on the toxic-comments XML corpus.

    Hyper-parameters are resolved by configure() from the architecture; the
    model name carries the architecture and an optional '-with_ELMo' suffix.
    """
    batch_size, maxlen, patience, early_stop, max_epoch, embeddings_name = configure(
        architecture, embeddings_name)

    root = 'data/sequenceLabelling/toxic/'
    train_path = os.path.join(root, 'corrected.xml')
    valid_path = os.path.join(root, 'valid.xml')

    print('Loading data...')
    x_train, y_train = load_data_and_labels_xml_file(train_path)
    x_valid, y_valid = load_data_and_labels_xml_file(valid_path)
    print(len(x_train), 'train sequences')
    print(len(x_valid), 'validation sequences')

    model_name = 'insult-' + architecture
    if use_ELMo:
        model_name += '-with_ELMo'

    model = Sequence(model_name, max_epoch=max_epoch, batch_size=batch_size,
                     max_sequence_length=maxlen, embeddings_name=embeddings_name,
                     architecture=architecture, patience=patience,
                     early_stop=early_stop, transformer_name=transformer,
                     use_ELMo=use_ELMo)

    model.train(x_train, y_train, x_valid=x_valid, y_valid=y_valid)
    print('training done')

    # saving the model (must be called after eval for multiple fold training)
    model.save()
def train(dataset_type='conll2003', lang='en', embeddings_name=None, architecture='BidLSTM_CRF',
          transformer=None, data_path=None, use_ELMo=False):
    """Train an NER model on all available data for the given dataset/language.

    All dataset partitions are concatenated, then re-split 90/10 into train
    vs. valid (no held-out evaluation here — see train_eval for that).
    Hyper-parameters are resolved by configure().
    """
    batch_size, max_sequence_length, patience, recurrent_dropout, early_stop, max_epoch, embeddings_name, word_lstm_units, multiprocessing = \
        configure(architecture, dataset_type, lang, embeddings_name, use_ELMo)

    if (dataset_type == 'conll2003') and (lang == 'en'):
        print('Loading data...')
        x_train1, y_train1 = load_data_and_labels_conll(
            'data/sequenceLabelling/CoNLL-2003/eng.train')
        x_train2, y_train2 = load_data_and_labels_conll(
            'data/sequenceLabelling/CoNLL-2003/eng.testa')
        x_train3, y_train3 = load_data_and_labels_conll(
            'data/sequenceLabelling/CoNLL-2003/eng.testb')

        # we concatenate all sets
        x_all = np.concatenate((x_train1, x_train2, x_train3), axis=0)
        y_all = np.concatenate((y_train1, y_train2, y_train3), axis=0)

        # split train and valid sets in a random way
        x_train, x_valid, y_train, y_valid = train_test_split(x_all, y_all, test_size=0.1)
        stats(x_train, y_train, x_valid, y_valid)

        model_name = 'ner-en-conll2003-' + architecture
        if use_ELMo:
            model_name += '-with_ELMo'

        model = Sequence(model_name, max_epoch=max_epoch, recurrent_dropout=recurrent_dropout,
                         embeddings_name=embeddings_name, architecture=architecture,
                         transformer_name=transformer, word_lstm_units=word_lstm_units,
                         batch_size=batch_size, early_stop=early_stop, patience=patience,
                         max_sequence_length=max_sequence_length, use_ELMo=use_ELMo,
                         multiprocessing=multiprocessing)
    elif (dataset_type == 'conll2012') and (lang == 'en'):
        print('Loading Ontonotes 5.0 CoNLL-2012 NER data...')
        x_train1, y_train1 = load_data_and_labels_conll(
            'data/sequenceLabelling/CoNLL-2012-NER/eng.train')
        x_train2, y_train2 = load_data_and_labels_conll(
            'data/sequenceLabelling/CoNLL-2012-NER/eng.dev')
        x_train3, y_train3 = load_data_and_labels_conll(
            'data/sequenceLabelling/CoNLL-2012-NER/eng.test')

        # we concatenate train and valid sets
        x_all = np.concatenate((x_train1, x_train2, x_train3), axis=0)
        y_all = np.concatenate((y_train1, y_train2, y_train3), axis=0)

        # split train and valid sets in a random way
        x_train, x_valid, y_train, y_valid = train_test_split(x_all, y_all, test_size=0.1)
        stats(x_train, y_train, x_valid, y_valid)

        model_name = 'ner-en-conll2012-' + architecture
        if use_ELMo:
            model_name += '-with_ELMo'

        model = Sequence(model_name, max_epoch=max_epoch, recurrent_dropout=recurrent_dropout,
                         embeddings_name=embeddings_name, architecture=architecture,
                         transformer_name=transformer, word_lstm_units=word_lstm_units,
                         batch_size=batch_size, early_stop=early_stop, patience=patience,
                         max_sequence_length=max_sequence_length, use_ELMo=use_ELMo,
                         multiprocessing=multiprocessing)
    elif (lang == 'fr'):
        print('Loading data...')
        dataset_type = 'lemonde'
        x_all, y_all = load_data_and_labels_lemonde(
            'data/sequenceLabelling/leMonde/ftb6_ALL.EN.docs.relinked.xml')
        shuffle_arrays([x_all, y_all])

        # split train and valid sets in a random way
        x_train, x_valid, y_train, y_valid = train_test_split(x_all, y_all, test_size=0.1)
        stats(x_train, y_train, x_valid, y_valid)

        model_name = 'ner-fr-lemonde-' + architecture
        if use_ELMo:
            model_name += '-with_ELMo'

        model = Sequence(model_name, max_epoch=max_epoch, recurrent_dropout=recurrent_dropout,
                         embeddings_name=embeddings_name, architecture=architecture,
                         transformer_name=transformer, word_lstm_units=word_lstm_units,
                         batch_size=batch_size, early_stop=early_stop, patience=patience,
                         max_sequence_length=max_sequence_length, use_ELMo=use_ELMo,
                         multiprocessing=multiprocessing)
    else:
        print("dataset/language combination is not supported:", dataset_type, lang)
        return

    #elif (dataset_type == 'ontonotes') and (lang == 'en'):
    #    model = sequenceLabelling.Sequence('ner-en-ontonotes', max_epoch=60, embeddings_name=embeddings_name)
    #elif (lang == 'fr'):
    #    model = sequenceLabelling.Sequence('ner-fr-lemonde', max_epoch=60, embeddings_name=embeddings_name)

    start_time = time.time()
    model.train(x_train, y_train, x_valid=x_valid, y_valid=y_valid)
    runtime = round(time.time() - start_time, 3)
    print("training runtime: %s seconds " % (runtime))

    # saving the model
    model.save()
def train_eval(embeddings_name=None, dataset_type='conll2003', lang='en', architecture='BidLSTM_CRF',
               transformer=None, fold_count=1, train_with_validation_set=False, data_path=None,
               use_ELMo=False):
    """Train an NER model and evaluate it on the dataset's test partition.

    When train_with_validation_set is True, the validation data is folded
    into training (no early stop); otherwise the validation set is used
    for early stopping only. Hyper-parameters are resolved by configure().
    """
    batch_size, max_sequence_length, patience, recurrent_dropout, early_stop, max_epoch, embeddings_name, word_lstm_units, multiprocessing = \
        configure(architecture, dataset_type, lang, embeddings_name, use_ELMo)

    if (dataset_type == 'conll2003') and (lang == 'en'):
        print('Loading CoNLL 2003 data...')
        x_train, y_train = load_data_and_labels_conll(
            'data/sequenceLabelling/CoNLL-2003/eng.train')
        x_valid, y_valid = load_data_and_labels_conll(
            'data/sequenceLabelling/CoNLL-2003/eng.testa')
        x_eval, y_eval = load_data_and_labels_conll(
            'data/sequenceLabelling/CoNLL-2003/eng.testb')
        stats(x_train, y_train, x_valid, y_valid, x_eval, y_eval)

        model_name = 'ner-en-conll2003-' + architecture
        if use_ELMo:
            model_name += '-with_ELMo'

        if not train_with_validation_set:
            # restrict training on train set, use validation set for early stop, as in most papers
            model = Sequence(model_name, max_epoch=max_epoch, recurrent_dropout=recurrent_dropout,
                             embeddings_name=embeddings_name, fold_number=fold_count,
                             architecture=architecture, transformer_name=transformer,
                             word_lstm_units=word_lstm_units, batch_size=batch_size,
                             early_stop=True, patience=patience,
                             max_sequence_length=max_sequence_length, use_ELMo=use_ELMo,
                             multiprocessing=multiprocessing)
        else:
            # also use validation set to train (no early stop, hyperparameters must be set preliminarily),
            # as (Chui & Nochols, 2016) and (Peters and al., 2017)
            # this leads obviously to much higher results (~ +0.5 f1 score with CoNLL-2003)
            model = Sequence(model_name, max_epoch=max_epoch, recurrent_dropout=recurrent_dropout,
                             embeddings_name=embeddings_name, early_stop=False,
                             fold_number=fold_count, architecture=architecture,
                             transformer_name=transformer, word_lstm_units=word_lstm_units,
                             batch_size=batch_size, patience=patience,
                             max_sequence_length=max_sequence_length, use_ELMo=use_ELMo,
                             multiprocessing=multiprocessing)
    elif (dataset_type == 'ontonotes-all') and (lang == 'en'):
        print(
            "Loading all Ontonotes 5.0 XML data, evaluation will be on 10\% random partition"
        )
        x_all, y_all = load_data_and_labels_ontonotes(data_path)
        # 10% held out for evaluation, then 10% of the rest for validation
        x_train_all, x_eval, y_train_all, y_eval = train_test_split(
            x_all, y_all, test_size=0.1)
        x_train, x_valid, y_train, y_valid = train_test_split(x_train_all, y_train_all, test_size=0.1)
        stats(x_train, y_train, x_valid, y_valid, x_eval, y_eval)

        model_name = 'ner-en-ontonotes-' + architecture
        if use_ELMo:
            model_name += '-with_ELMo'

        model = Sequence(model_name, max_epoch=max_epoch, recurrent_dropout=recurrent_dropout,
                         embeddings_name=embeddings_name, fold_number=fold_count,
                         architecture=architecture, transformer_name=transformer,
                         word_lstm_units=word_lstm_units, batch_size=batch_size,
                         early_stop=early_stop, patience=patience,
                         max_sequence_length=max_sequence_length, use_ELMo=use_ELMo,
                         multiprocessing=multiprocessing)
    elif (dataset_type == 'conll2012') and (lang == 'en'):
        print('Loading Ontonotes 5.0 CoNLL-2012 NER data...')
        x_train, y_train = load_data_and_labels_conll(
            'data/sequenceLabelling/CoNLL-2012-NER/eng.train')
        x_valid, y_valid = load_data_and_labels_conll(
            'data/sequenceLabelling/CoNLL-2012-NER/eng.dev')
        x_eval, y_eval = load_data_and_labels_conll(
            'data/sequenceLabelling/CoNLL-2012-NER/eng.test')
        stats(x_train, y_train, x_valid, y_valid, x_eval, y_eval)

        model_name = 'ner-en-conll2012-' + architecture
        if use_ELMo:
            model_name += '-with_ELMo'

        if not train_with_validation_set:
            model = Sequence(model_name, max_epoch=max_epoch, recurrent_dropout=recurrent_dropout,
                             embeddings_name=embeddings_name, fold_number=fold_count,
                             architecture=architecture, transformer_name=transformer,
                             word_lstm_units=word_lstm_units, batch_size=batch_size,
                             early_stop=True, patience=patience,
                             max_sequence_length=max_sequence_length, use_ELMo=use_ELMo,
                             multiprocessing=multiprocessing)
        else:
            # also use validation set to train (no early stop, hyperparameters must be set preliminarily),
            # as (Chui & Nochols, 2016) and (Peters and al., 2017)
            # this leads obviously to much higher results
            model = Sequence(model_name, max_epoch=max_epoch, recurrent_dropout=recurrent_dropout,
                             embeddings_name=embeddings_name, early_stop=False,
                             fold_number=fold_count, architecture=architecture,
                             transformer_name=transformer, word_lstm_units=word_lstm_units,
                             batch_size=batch_size, patience=patience,
                             max_sequence_length=max_sequence_length, use_ELMo=use_ELMo,
                             multiprocessing=multiprocessing)
    elif (lang == 'fr') and (dataset_type == 'ftb' or dataset_type is None):
        print('Loading data for ftb...')
        x_all, y_all = load_data_and_labels_lemonde(
            'data/sequenceLabelling/leMonde/ftb6_ALL.EN.docs.relinked.xml')
        shuffle_arrays([x_all, y_all])
        # 10% held out for evaluation, then 10% of the rest for validation
        x_train_all, x_eval, y_train_all, y_eval = train_test_split(
            x_all, y_all, test_size=0.1)
        x_train, x_valid, y_train, y_valid = train_test_split(x_train_all, y_train_all, test_size=0.1)
        stats(x_train, y_train, x_valid, y_valid, x_eval, y_eval)

        model_name = 'ner-fr-lemonde-' + architecture
        if use_ELMo:
            model_name += '-with_ELMo'

        model = Sequence(model_name, max_epoch=max_epoch, recurrent_dropout=recurrent_dropout,
                         embeddings_name=embeddings_name, fold_number=fold_count,
                         architecture=architecture, transformer_name=transformer,
                         word_lstm_units=word_lstm_units, batch_size=batch_size,
                         early_stop=early_stop, patience=patience,
                         max_sequence_length=max_sequence_length, use_ELMo=use_ELMo,
                         multiprocessing=multiprocessing)
    elif (lang == 'fr') and (dataset_type == 'ftb_force_split'):
        print('Loading data for ftb_force_split...')
        x_train, y_train = load_data_and_labels_conll(
            'data/sequenceLabelling/leMonde/ftb6_train.conll')
        shuffle_arrays([x_train, y_train])
        x_valid, y_valid = load_data_and_labels_conll(
            'data/sequenceLabelling/leMonde/ftb6_dev.conll')
        x_eval, y_eval = load_data_and_labels_conll(
            'data/sequenceLabelling/leMonde/ftb6_test.conll')
        stats(x_train, y_train, x_valid, y_valid, x_eval, y_eval)

        model_name = 'ner-fr-lemonde-force-split-' + architecture
        if use_ELMo:
            model_name += '-with_ELMo'

        if not train_with_validation_set:
            # restrict training on train set, use validation set for early stop, as in most papers
            model = Sequence(model_name, max_epoch=max_epoch, recurrent_dropout=recurrent_dropout,
                             embeddings_name=embeddings_name, early_stop=True,
                             fold_number=fold_count, architecture=architecture,
                             transformer_name=transformer, word_lstm_units=word_lstm_units,
                             batch_size=batch_size, patience=patience,
                             max_sequence_length=max_sequence_length, use_ELMo=use_ELMo,
                             multiprocessing=multiprocessing)
        else:
            # also use validation set to train (no early stop, hyperparameters must be set preliminarily),
            # as (Chui & Nochols, 2016) and (Peters and al., 2017)
            # this leads obviously to much higher results (~ +0.5 f1 score with CoNLL-2003)
            model = Sequence(model_name, max_epoch=max_epoch, recurrent_dropout=recurrent_dropout,
                             embeddings_name=embeddings_name, early_stop=False,
                             fold_number=fold_count, architecture=architecture,
                             transformer_name=transformer, word_lstm_units=word_lstm_units,
                             batch_size=batch_size, patience=patience,
                             max_sequence_length=max_sequence_length, use_ELMo=use_ELMo,
                             multiprocessing=multiprocessing)
    elif (lang == 'fr') and (dataset_type == 'ftb_force_split_xml'):
        print('Loading data for ftb_force_split_xml...')
        x_train, y_train = load_data_and_labels_lemonde(
            'data/sequenceLabelling/leMonde/ftb6_ALL.EN.docs.relinked.train.xml'
        )
        shuffle_arrays([x_train, y_train])
        x_valid, y_valid = load_data_and_labels_lemonde(
            'data/sequenceLabelling/leMonde/ftb6_ALL.EN.docs.relinked.dev.xml')
        x_eval, y_eval = load_data_and_labels_lemonde(
            'data/sequenceLabelling/leMonde/ftb6_ALL.EN.docs.relinked.test.xml'
        )
        stats(x_train, y_train, x_valid, y_valid, x_eval, y_eval)

        model_name = 'ner-fr-lemonde-force-split-xml-' + architecture
        if use_ELMo:
            model_name += '-with_ELMo'

        if not train_with_validation_set:
            # restrict training on train set, use validation set for early stop, as in most papers
            model = Sequence(model_name, max_epoch=max_epoch, recurrent_dropout=recurrent_dropout,
                             embeddings_name=embeddings_name, early_stop=True,
                             fold_number=fold_count, architecture=architecture,
                             transformer_name=transformer, word_lstm_units=word_lstm_units,
                             batch_size=batch_size, patience=patience,
                             max_sequence_length=max_sequence_length, use_ELMo=use_ELMo,
                             multiprocessing=multiprocessing)
        else:
            # also use validation set to train (no early stop, hyperparameters must be set preliminarily),
            # as (Chui & Nochols, 2016) and (Peters and al., 2017)
            # this leads obviously to much higher results (~ +0.5 f1 score with CoNLL-2003)
            model = Sequence(model_name, max_epoch=max_epoch, recurrent_dropout=recurrent_dropout,
                             embeddings_name=embeddings_name, early_stop=False,
                             fold_number=fold_count, architecture=architecture,
                             transformer_name=transformer, word_lstm_units=word_lstm_units,
                             batch_size=batch_size, patience=patience,
                             max_sequence_length=max_sequence_length, use_ELMo=use_ELMo,
                             multiprocessing=multiprocessing)
    else:
        print("dataset/language combination is not supported:", dataset_type, lang)
        return

    start_time = time.time()
    if fold_count == 1:
        model.train(x_train, y_train, x_valid=x_valid, y_valid=y_valid)
    else:
        model.train_nfold(x_train, y_train, x_valid=x_valid, y_valid=y_valid)
    runtime = round(time.time() - start_time, 3)
    print("training runtime: %s seconds " % (runtime))

    print("\nEvaluation on test set:")
    model.eval(x_eval, y_eval)

    # # saving the model (must be called after eval for multiple fold training)
    model.save()
def train(embeddings_name=None, architecture='BidLSTM_CRF', transformer=None, input_path=None,
          output_path=None, fold_count=1, features_indices=None, max_sequence_length=-1,
          batch_size=-1, max_epoch=-1, use_ELMo=False):
    """Train a dataset-mention recognition model.

    Without input_path, the three bundled dataset-sentence corpora that are
    present on disk are concatenated (only the first 1000 coleridge sequences
    are used). Hyper-parameters passed as -1 are resolved by configure().
    """
    print('Loading data...')
    if input_path is None:
        # NOTE(review): chained assignment binds all six names to the SAME
        # empty list — harmless here because each is only rebound, never mutated
        x_all1 = y_all1 = x_all2 = y_all2 = x_all3 = y_all3 = []

        dataseer_sentences_path = "data/sequenceLabelling/datasets/dataseer_sentences.json"
        if os.path.exists(dataseer_sentences_path):
            x_all1, y_all1 = load_data_and_labels_json_offsets(
                dataseer_sentences_path)

        ner_dataset_recognition_sentences_path = "data/sequenceLabelling/datasets/ner_dataset_recognition_sentences.json"
        if os.path.exists(ner_dataset_recognition_sentences_path):
            x_all2, y_all2 = load_data_and_labels_json_offsets(
                ner_dataset_recognition_sentences_path)

        coleridge_sentences_path = "data/sequenceLabelling/datasets/coleridge_sentences.json.gz"
        if os.path.exists(coleridge_sentences_path):
            x_all3, y_all3 = load_data_and_labels_json_offsets(
                coleridge_sentences_path)

        # cap the coleridge corpus at its first 1000 sequences
        x_all = np.concatenate((x_all1, x_all2, x_all3[:1000]), axis=0)
        y_all = np.concatenate((y_all1, y_all2, y_all3[:1000]), axis=0)
    else:
        x_all, y_all = load_data_and_labels_json_offsets(input_path)

    x_train, x_valid, y_train, y_valid = train_test_split(x_all, y_all, test_size=0.1, shuffle=True)

    print(len(x_train), 'train sequences')
    print(len(x_valid), 'validation sequences')

    # resolve defaults (-1 values) and build the final model name
    batch_size, max_sequence_length, model_name, embeddings_name, max_epoch, multiprocessing, early_stop = configure(
        architecture, output_path, max_sequence_length, batch_size, embeddings_name, max_epoch,
        use_ELMo)

    model = Sequence(model_name, recurrent_dropout=0.50, embeddings_name=embeddings_name,
                     architecture=architecture, transformer_name=transformer,
                     max_sequence_length=max_sequence_length, batch_size=batch_size,
                     fold_number=fold_count, features_indices=features_indices,
                     max_epoch=max_epoch, use_ELMo=use_ELMo,
                     multiprocessing=multiprocessing, early_stop=early_stop)

    start_time = time.time()
    model.train(x_train, y_train, x_valid=x_valid, y_valid=y_valid)
    runtime = round(time.time() - start_time, 3)
    print("training runtime: %s seconds " % runtime)

    # saving the model
    if output_path:
        model.save(output_path)
    else:
        model.save()
def train_eval(embedding_name, dataset_type='conll2003', lang='en', architecture='BidLSTM_CRF', fold_count=1, train_with_validation_set=False, use_ELMo=False, use_BERT=False, data_path=None):
    """Train a NER model on a benchmark corpus and evaluate it on the test set.

    Supported dataset/lang combinations: CoNLL-2003 (en), Ontonotes 5.0 XML
    (en, read from data_path), CoNLL-2012 (en), and Le Monde FTB (fr).
    When train_with_validation_set is True, the validation data is folded
    into training with no early stopping, as in (Chiu & Nichols, 2016) and
    (Peters et al., 2017).
    """
    # architecture-dependent hyper-parameters; note max_epoch set here is only
    # applied in the train_with_validation_set (no-early-stop) branches below
    if (architecture == "BidLSTM_CNN_CRF"):
        word_lstm_units = 200
        max_epoch = 30
        recurrent_dropout = 0.5
    else:
        word_lstm_units = 100
        max_epoch = 25
        recurrent_dropout = 0.5

    # NOTE(review): batch size is *larger* with ELMo/BERT here (120 vs 20) —
    # presumably for contextual-embedding throughput; confirm memory fits
    if use_ELMo or use_BERT:
        batch_size = 120
    else:
        batch_size = 20

    if (dataset_type == 'conll2003') and (lang == 'en'):
        print('Loading CoNLL 2003 data...')
        x_train, y_train = load_data_and_labels_conll('data/sequenceLabelling/CoNLL-2003/eng.train')
        x_valid, y_valid = load_data_and_labels_conll('data/sequenceLabelling/CoNLL-2003/eng.testa')
        x_eval, y_eval = load_data_and_labels_conll('data/sequenceLabelling/CoNLL-2003/eng.testb')
        stats(x_train, y_train, x_valid, y_valid, x_eval, y_eval)

        model_name = 'ner-en-conll2003'
        if use_ELMo:
            model_name += '-with_ELMo'
        elif use_BERT:
            model_name += '-with_BERT'
        model_name += '-' + architecture

        if not train_with_validation_set:
            # restrict training to the train set, use the validation set for
            # early stopping, as in most papers
            model = Sequence(model_name, max_epoch=60, recurrent_dropout=recurrent_dropout, embeddings_name=embedding_name, early_stop=True, fold_number=fold_count, model_type=architecture, word_lstm_units=word_lstm_units, batch_size=batch_size, use_ELMo=use_ELMo, use_BERT=use_BERT)
        else:
            # also use the validation set to train (no early stop, so the
            # hyperparameters must be set beforehand), as in
            # (Chiu & Nichols, 2016) and (Peters et al., 2017);
            # this leads obviously to much higher results
            # (~ +0.5 f1 score with CoNLL-2003)
            model = Sequence(model_name, max_epoch=max_epoch, recurrent_dropout=recurrent_dropout, embeddings_name=embedding_name, early_stop=False, fold_number=fold_count, model_type=architecture, word_lstm_units=word_lstm_units, batch_size=batch_size, use_ELMo=use_ELMo, use_BERT=use_BERT)
    elif (dataset_type == 'ontonotes-all') and (lang == 'en'):
        print('Loading Ontonotes 5.0 XML data...')
        x_all, y_all = load_data_and_labels_ontonotes(data_path)
        # hold out 10% for evaluation, then 10% of the remainder for validation
        x_train_all, x_eval, y_train_all, y_eval = train_test_split(x_all, y_all, test_size=0.1)
        x_train, x_valid, y_train, y_valid = train_test_split(x_train_all, y_train_all, test_size=0.1)
        stats(x_train, y_train, x_valid, y_valid, x_eval, y_eval)

        model_name = 'ner-en-ontonotes'
        if use_ELMo:
            model_name += '-with_ELMo'
        elif use_BERT:
            model_name += '-with_BERT'
        model_name += '-' + architecture

        model = Sequence(model_name, max_epoch=60, recurrent_dropout=recurrent_dropout, embeddings_name=embedding_name, early_stop=True, fold_number=fold_count, model_type=architecture, word_lstm_units=word_lstm_units, batch_size=batch_size, use_ELMo=use_ELMo, use_BERT=use_BERT)
    elif (dataset_type == 'conll2012') and (lang == 'en'):
        print('Loading Ontonotes 5.0 CoNLL-2012 NER data...')
        x_train, y_train = load_data_and_labels_conll('data/sequenceLabelling/CoNLL-2012-NER/eng.train')
        x_valid, y_valid = load_data_and_labels_conll('data/sequenceLabelling/CoNLL-2012-NER/eng.dev')
        x_eval, y_eval = load_data_and_labels_conll('data/sequenceLabelling/CoNLL-2012-NER/eng.test')
        stats(x_train, y_train, x_valid, y_valid, x_eval, y_eval)

        model_name = 'ner-en-conll2012'
        if use_ELMo:
            model_name += '-with_ELMo'
        elif use_BERT:
            model_name += '-with_BERT'
        model_name += '-' + architecture

        if not train_with_validation_set:
            model = Sequence(model_name, max_epoch=80, recurrent_dropout=recurrent_dropout, embeddings_name=embedding_name, early_stop=True, fold_number=fold_count, model_type=architecture, word_lstm_units=word_lstm_units, batch_size=batch_size, use_ELMo=use_ELMo, use_BERT=use_BERT)
        else:
            # also use the validation set to train (no early stop, so the
            # hyperparameters must be set beforehand), as in
            # (Chiu & Nichols, 2016) and (Peters et al., 2017);
            # this leads obviously to much higher results
            model = Sequence(model_name, max_epoch=40, recurrent_dropout=recurrent_dropout, embeddings_name=embedding_name, early_stop=False, fold_number=fold_count, model_type=architecture, word_lstm_units=word_lstm_units, batch_size=batch_size, use_ELMo=use_ELMo, use_BERT=use_BERT)
    elif (lang == 'fr'):
        print('Loading data...')
        dataset_type = 'lemonde'
        x_all, y_all = load_data_and_labels_lemonde('data/sequenceLabelling/leMonde/ftb6_ALL.EN.docs.relinked.xml')
        # hold out 10% for evaluation, then 10% of the remainder for validation
        x_train_all, x_eval, y_train_all, y_eval = train_test_split(x_all, y_all, test_size=0.1)
        x_train, x_valid, y_train, y_valid = train_test_split(x_train_all, y_train_all, test_size=0.1)
        stats(x_train, y_train, x_valid, y_valid, x_eval, y_eval)

        model_name = 'ner-fr-lemonde'
        if use_ELMo:
            model_name += '-with_ELMo'
            # custom batch size for French ELMo
            batch_size = 20
        elif use_BERT:
            # need to find a French BERT :/
            model_name += '-with_BERT'
        model_name += '-' + architecture

        model = Sequence(model_name, max_epoch=60, recurrent_dropout=recurrent_dropout, embeddings_name=embedding_name, early_stop=True, fold_number=fold_count, model_type=architecture, word_lstm_units=word_lstm_units, batch_size=batch_size, use_ELMo=use_ELMo, use_BERT=use_BERT)
    else:
        print("dataset/language combination is not supported:", dataset_type, lang)
        return

    start_time = time.time()
    if fold_count == 1:
        model.train(x_train, y_train, x_valid, y_valid)
    else:
        model.train_nfold(x_train, y_train, x_valid, y_valid, fold_number=fold_count)
    runtime = round(time.time() - start_time, 3)
    print("training runtime: %s seconds " % (runtime))

    # final evaluation on the held-out test set
    print("\nEvaluation on test set:")
    model.eval(x_eval, y_eval)

    # saving the model
    model.save()
def train(embedding_name, dataset_type='conll2003', lang='en', architecture='BidLSTM_CRF', use_ELMo=False, use_BERT=False, data_path=None):
    """Train a production NER model on ALL available data for a corpus.

    Unlike train_eval(), the train/dev/test files are merged and only a random
    10% validation split is held out — no final evaluation is performed.
    Supported combinations: CoNLL-2003 (en), CoNLL-2012 (en), Le Monde
    FTB (fr).  The trained model is saved to its default location.

    Args:
        embedding_name: name of the static word embeddings.
        dataset_type: 'conll2003' or 'conll2012' (for lang='en').
        lang: 'en' or 'fr'.
        architecture: model architecture identifier.
        use_ELMo / use_BERT: add contextual embeddings (mutually exclusive,
            ELMo wins for the model name).
        data_path: unused here; kept for signature parity with train_eval().
    """
    # architecture-dependent hyper-parameters
    if (architecture == "BidLSTM_CNN_CRF"):
        word_lstm_units = 200
        recurrent_dropout = 0.5
    else:
        word_lstm_units = 100
        recurrent_dropout = 0.5

    if use_ELMo:
        batch_size = 100
    else:
        batch_size = 20

    def _decorated_model_name(base_name):
        # Append the contextual-embedding and architecture suffixes; this
        # naming scheme is shared by all NER training entry points.
        if use_ELMo:
            base_name += '-with_ELMo'
        elif use_BERT:
            base_name += '-with_BERT'
        return base_name + '-' + architecture

    if (dataset_type == 'conll2003') and (lang == 'en'):
        print('Loading data...')
        x_train1, y_train1 = load_data_and_labels_conll('data/sequenceLabelling/CoNLL-2003/eng.train')
        x_train2, y_train2 = load_data_and_labels_conll('data/sequenceLabelling/CoNLL-2003/eng.testa')
        x_train3, y_train3 = load_data_and_labels_conll('data/sequenceLabelling/CoNLL-2003/eng.testb')

        # we concatenate all sets
        x_all = np.concatenate((x_train1, x_train2, x_train3), axis=0)
        y_all = np.concatenate((y_train1, y_train2, y_train3), axis=0)

        # split train and valid sets in a random way
        x_train, x_valid, y_train, y_valid = train_test_split(x_all, y_all, test_size=0.1)
        stats(x_train, y_train, x_valid, y_valid)

        model_name = _decorated_model_name('ner-en-conll2003')
        model = Sequence(model_name, max_epoch=60, recurrent_dropout=recurrent_dropout,
                         embeddings_name=embedding_name, model_type=architecture,
                         word_lstm_units=word_lstm_units, batch_size=batch_size,
                         use_ELMo=use_ELMo, use_BERT=use_BERT)
    elif (dataset_type == 'conll2012') and (lang == 'en'):
        print('Loading Ontonotes 5.0 CoNLL-2012 NER data...')
        x_train1, y_train1 = load_data_and_labels_conll('data/sequenceLabelling/CoNLL-2012-NER/eng.train')
        x_train2, y_train2 = load_data_and_labels_conll('data/sequenceLabelling/CoNLL-2012-NER/eng.dev')
        x_train3, y_train3 = load_data_and_labels_conll('data/sequenceLabelling/CoNLL-2012-NER/eng.test')

        # we concatenate train and valid sets
        x_all = np.concatenate((x_train1, x_train2, x_train3), axis=0)
        y_all = np.concatenate((y_train1, y_train2, y_train3), axis=0)

        # split train and valid sets in a random way
        x_train, x_valid, y_train, y_valid = train_test_split(x_all, y_all, test_size=0.1)
        stats(x_train, y_train, x_valid, y_valid)

        model_name = _decorated_model_name('ner-en-conll2012')
        # NOTE(review): recurrent_dropout is hard-coded to 0.20 here instead
        # of the architecture-dependent value — kept as in the original
        model = Sequence(model_name, max_epoch=80, recurrent_dropout=0.20,
                         embeddings_name=embedding_name, early_stop=True,
                         model_type=architecture, word_lstm_units=word_lstm_units,
                         batch_size=batch_size, use_ELMo=use_ELMo, use_BERT=use_BERT)
    elif (lang == 'fr'):
        print('Loading data...')
        dataset_type = 'lemonde'
        x_all, y_all = load_data_and_labels_lemonde('data/sequenceLabelling/leMonde/ftb6_ALL.EN.docs.relinked.xml')
        x_train, x_valid, y_train, y_valid = train_test_split(x_all, y_all, test_size=0.1)
        stats(x_train, y_train, x_valid, y_valid)

        model_name = _decorated_model_name('ner-fr-lemonde')
        model = Sequence(model_name, max_epoch=60, recurrent_dropout=recurrent_dropout,
                         embeddings_name=embedding_name, model_type=architecture,
                         word_lstm_units=word_lstm_units, batch_size=batch_size,
                         use_ELMo=use_ELMo, use_BERT=use_BERT)
    else:
        print("dataset/language combination is not supported:", dataset_type, lang)
        return

    start_time = time.time()
    model.train(x_train, y_train, x_valid, y_valid)
    runtime = round(time.time() - start_time, 3)
    print("training runtime: %s seconds " % (runtime))

    # saving the model
    model.save()
def migrate():
    """Migration utility: reload and re-save existing models so they are
    written back in the current storage format.

    Most branches are currently disabled — left as inert triple-quoted
    string literals — and only the French NER models are actively migrated.
    """
    # grobid models (migration currently disabled)
    for grobid_model in GROBID_MODEL:
        model_name = 'grobid-' + grobid_model
        # load the model (disabled)
        """
        print(os.path.join(DATA_PATH, model_name))
        if os.path.isdir(os.path.join(DATA_PATH, model_name)):
            model = Sequence(model_name)
            model.load()
            model.save()
        """
        # with ELMo (disabled)
        """
        if os.path.isdir(os.path.join(DATA_PATH, model_name+'-with_ELMo')):
            model = Sequence(model_name+'-with_ELMo')
            model.load()
            model.save()
        """

    # insult model (disabled)
    """
    model = Sequence('insult')
    model.load()
    model.save()
    """

    # English NER models: all three variants (plain / ELMo / BERT) disabled;
    # the loop body consists only of inert string literals
    for en_model in NER_MODELS_EN:
        for architecture in ARCHITECTURE:
            """
            model_name = 'ner-en-' + en_model
            model_name += '-' + architecture
            if os.path.isdir(os.path.join(DATA_PATH, model_name)):
                model = Sequence(model_name)
                model.load()
                model.save()
            """
            """
            model_name = 'ner-en-' + en_model
            model_name += '-with_ELMo'
            model_name += '-' + architecture
            if os.path.isdir(os.path.join(DATA_PATH, model_name)):
                model = Sequence(model_name)
                model.load()
                model.save()
            """
            """
            model_name = 'ner-en-' + en_model
            model_name += '-with_BERT'
            model_name += '-' + architecture
            if os.path.isdir(os.path.join(DATA_PATH, model_name)):
                model = Sequence(model_name)
                model.load()
                model.save()
            """

    # French NER models: the only active migration — for every existing model
    # directory (with and without ELMo), load the model and save it again
    for fr_model in NER_MODELS_FR:
        for architecture in ARCHITECTURE:
            model_name = 'ner-fr-' + fr_model
            model_name += '-' + architecture
            print(os.path.join(DATA_PATH, model_name))
            if os.path.isdir(os.path.join(DATA_PATH, model_name)):
                model = Sequence(model_name)
                model.load()
                model.save()

            model_name = 'ner-fr-' + fr_model
            model_name += '-with_ELMo'
            model_name += '-' + architecture
            if os.path.isdir(os.path.join(DATA_PATH, model_name)):
                model = Sequence(model_name)
                model.load()
                model.save()
def train_eval(model, embeddings_name, architecture='BidLSTM_CRF', use_ELMo=False, input_path=None, output_path=None, fold_count=1):
    """Train a grobid sequence-labelling model and evaluate it on a held-out split.

    The corpus is read from input_path when provided, otherwise from the
    default grobid training file for this model.  10% of the data is reserved
    for evaluation and, of the remainder, 10% for validation.  The trained
    model is saved to output_path when given, else to the default location.
    """
    print('Loading data...')
    if input_path is None:
        corpus_path = 'data/sequenceLabelling/grobid/' + model + '/' + model + '-060518.train'
    else:
        corpus_path = input_path
    x_all, y_all, f_all = load_data_and_labels_crf_file(corpus_path)

    # hold out 10% for evaluation, then 10% of the remainder for validation
    x_train_pool, x_eval, y_train_pool, y_eval = train_test_split(x_all, y_all, test_size=0.1)
    x_train, x_valid, y_train, y_valid = train_test_split(x_train_pool, y_train_pool, test_size=0.1)

    for label, split in (('train', x_train), ('validation', x_valid), ('evaluation', x_eval)):
        print(len(split), label + ' sequences')

    model_name = model if output_path else 'grobid-' + model

    batch_size = 20
    max_sequence_length = 3000
    if use_ELMo:
        model_name += '-with_ELMo'
        # much smaller batch for the software model with ELMo — presumably a
        # memory constraint
        if model_name in ('software-with_ELMo', 'grobid-software-with_ELMo'):
            batch_size = 3

    model = Sequence(model_name, max_epoch=100, recurrent_dropout=0.50,
                     embeddings_name=embeddings_name, model_type=architecture,
                     use_ELMo=use_ELMo, max_sequence_length=max_sequence_length,
                     batch_size=batch_size, fold_number=fold_count)

    start_time = time.time()
    if fold_count == 1:
        model.train(x_train, y_train, x_valid, y_valid)
    else:
        model.train_nfold(x_train, y_train, x_valid, y_valid, fold_number=fold_count)
    runtime = round(time.time() - start_time, 3)
    print("training runtime: %s seconds " % (runtime))

    # evaluation
    print("\nEvaluation:")
    model.eval(x_eval, y_eval)

    # saving the model
    if output_path:
        model.save(output_path)
    else:
        model.save()
def run_eval_txt(xml_repo_path, model, nb_threads=1, use_ELMo=False):
    """Benchmark a grobid model on paragraph texts extracted from TEI/XML files.

    Walks xml_repo_path, extracts the text of every <p> element (TEI
    namespace) from .xml/.tei files, tags the texts in batches via
    process_batch_txt(), and reports throughput statistics.  Processing is
    capped at roughly 50 XML files.
    """
    # load the model
    model_name = 'grobid-' + model
    if use_ELMo:
        model_name += '-with_ELMo'
    model = Sequence(model_name)
    model.load()
    if not use_ELMo:
        model.model_config.batch_size = 200

    start_time = time.time()

    # acquisition of texts
    texts = []
    nb_texts = 0
    nb_tokens = 0
    nb_files = 0
    for (dirpath, dirnames, filenames) in os.walk(xml_repo_path):
        # Fix: the original 50-file cap only broke out of the inner loop, so
        # every subsequent directory still processed one extra file.
        if nb_files > 50:
            break
        for filename in filenames:
            if filename.endswith('.xml') or filename.endswith('.tei'):
                try:
                    tree = ET.parse(os.path.join(dirpath, filename))
                except ET.ParseError:
                    # skip malformed documents instead of aborting the benchmark
                    print("XML parsing error with", filename)
                    continue
                for paragraph in tree.findall(".//{http://www.tei-c.org/ns/1.0}p"):
                    text = ET.tostring(paragraph, encoding='utf-8', method='text').decode('utf-8')
                    text = text.replace("\n", " ")
                    text = text.replace("\t", " ")
                    # Fix: the original assigned the whitespace-collapsed text
                    # to a misspelled variable ("test") and appended the
                    # un-normalized text instead.
                    text = re.sub(r'( )+', ' ', text.strip())
                    texts.append(text)
                    nb_texts += 1
                    nb_tokens += len(pattern.split(text))
                    if len(texts) == model.model_config.batch_size:
                        process_batch_txt(texts, model, nb_threads)
                        texts = []
                nb_files += 1
                if nb_files > 50:
                    break

    # last batch
    if len(texts) > 0:
        process_batch_txt(texts, model, nb_threads)

    print("-----------------------------")
    print("nb xml files:", nb_files)
    print("nb texts:", nb_texts)
    print("nb tokens:", nb_tokens)

    runtime = round(time.time() - start_time, 4)
    print("-----------------------------")
    print("total runtime: %s seconds " % (runtime))
    print("-----------------------------")
    print("xml files/s:\t {:.4f}".format(nb_files / runtime))
    print(" texts/s:\t {:.4f}".format(nb_texts / runtime))
    print(" tokens/s:\t {:.4f}".format(nb_tokens / runtime))