def train(factrueval2016_devset_dir: str, split_by_paragraphs: bool,
          bert_will_be_tuned: bool, lstm_layer_size: Union[int, None],
          max_epochs: int, batch_size: int, gpu_memory_frac: float,
          model_name: str) -> BERT_NER:
    """Return a BERT_NER recognizer, loading it from disk or training it.

    If `model_name` is an existing file, the recognizer is unpickled from it.
    Otherwise the FactRuEval-2016 devset is converted to a temporary JSON file,
    loaded, used to fit a new recognizer, and the fitted recognizer is pickled
    into `model_name`.

    :param factrueval2016_devset_dir: directory passed to
        `factrueval2016_to_json` as the data source.
    :param split_by_paragraphs: forwarded to `factrueval2016_to_json`.
    :param bert_will_be_tuned: passed to BERT_NER as `finetune_bert`; it also
        selects the learning rate (1e-5 when true, 1e-3 otherwise).
    :param lstm_layer_size: passed to BERT_NER as `lstm_units`.
    :param max_epochs: passed to BERT_NER as `max_epochs`.
    :param batch_size: passed to BERT_NER as `batch_size`.
    :param gpu_memory_frac: passed to BERT_NER as `gpu_memory_frac`.
    :param model_name: path of the pickle file to load from or save to.
    :return: the loaded or newly fitted recognizer.
    :raises AssertionError: if the unpickled object is not a BERT_NER.
    """
    if os.path.isfile(model_name):
        # NOTE: unpickling can execute arbitrary code, so `model_name` must
        # come from a trusted source.
        with open(model_name, 'rb') as fp:
            recognizer = pickle.load(fp)
        assert isinstance(recognizer, BERT_NER)
        print('The NER has been successfully loaded from the file `{0}`...'.
              format(model_name))
        print('')
        return recognizer

    # A private temporary directory gives a path nobody else can pre-create
    # and is removed on exit (even on error), instead of borrowing the name
    # of a NamedTemporaryFile whose handle is never closed explicitly.
    with tempfile.TemporaryDirectory() as temp_dir:
        temp_json_name = os.path.join(temp_dir, 'factrueval2016.json')
        factrueval2016_to_json(factrueval2016_devset_dir, temp_json_name,
                               split_by_paragraphs)
        X, y = load_dataset(temp_json_name)
    print('Data for training have been loaded...')
    print('Number of samples is {0}.'.format(len(y)))
    print('')

    # The TF-Hub handle is supplied only when no BERT path is configured
    # on the class.
    if BERT_NER.PATH_TO_BERT is None:
        bert_hub_module_handle = 'https://tfhub.dev/google/bert_multi_cased_L-12_H-768_A-12/1'
    else:
        bert_hub_module_handle = None
    recognizer = BERT_NER(
        finetune_bert=bert_will_be_tuned, batch_size=batch_size, l2_reg=1e-3,
        bert_hub_module_handle=bert_hub_module_handle,
        lstm_units=lstm_layer_size, validation_fraction=0.25,
        max_epochs=max_epochs, patience=3, gpu_memory_frac=gpu_memory_frac,
        verbose=True, random_seed=42,
        lr=1e-5 if bert_will_be_tuned else 1e-3
    )
    recognizer.fit(X, y)
    with open(model_name, 'wb') as fp:
        pickle.dump(recognizer, fp)
    print('')
    print('The NER has been successfully fitted and saved into the file `{0}`...'
          .format(model_name))
    print('')
    return recognizer
def train(train_file_name: str, valid_file_name: str,
          split_by_paragraphs: bool, bert_will_be_tuned: bool,
          lstm_layer_size: Union[int, None], l2: float, max_epochs: int,
          batch_size: int, gpu_memory_frac: float,
          model_name: str) -> BERT_NER:
    """Return a BERT_NER recognizer, loading it from disk or training it.

    If `model_name` is an existing file, the recognizer is unpickled from it.
    Otherwise the training and validation sets are read from BIO-formatted
    files, a new recognizer is fitted, pickled into `model_name`, and its
    quality on the validation set is printed (overall and per entity type).

    :param train_file_name: BIO file with the training data.
    :param valid_file_name: BIO file with the validation data.
    :param split_by_paragraphs: if true, `{'-DOCSTART-'}` is passed to the
        loader as `paragraph_separators`; otherwise `None` is passed.
    :param bert_will_be_tuned: passed to BERT_NER as `finetune_bert`; it also
        selects the learning rate (1e-6 when true, 1e-4 otherwise).
    :param lstm_layer_size: passed to BERT_NER as `lstm_units`.
    :param l2: passed to BERT_NER as `l2_reg`.
    :param max_epochs: passed to BERT_NER as `max_epochs`.
    :param batch_size: passed to BERT_NER as `batch_size`.
    :param gpu_memory_frac: passed to BERT_NER as `gpu_memory_frac`.
    :param model_name: path of the pickle file to load from or save to.
    :return: the loaded or newly fitted recognizer.
    :raises AssertionError: if the unpickled object is not a BERT_NER.
    """
    if os.path.isfile(model_name):
        # NOTE: unpickling can execute arbitrary code, so `model_name` must
        # come from a trusted source.
        with open(model_name, 'rb') as fp:
            recognizer = pickle.load(fp)
        assert isinstance(recognizer, BERT_NER)
        print('The NER has been successfully loaded from the file `{0}`...'.
              format(model_name))
        print('')
        return recognizer

    def _load_bio(file_name: str):
        # Fresh set objects are built for every call, as in the two original
        # call sites, so the loader never sees a shared container.
        return load_dataset_from_bio(
            file_name,
            paragraph_separators=({'-DOCSTART-'} if split_by_paragraphs else None),
            stopwords={'-DOCSTART-'}
        )

    X_train, y_train = _load_bio(train_file_name)
    X_val, y_val = _load_bio(valid_file_name)
    print('The CoNLL-2003 data for training and validation have been loaded...')
    print('Number of samples for training is {0}.'.format(len(y_train)))
    print('Number of samples for validation is {0}.'.format(len(y_val)))
    print('')

    # The TF-Hub handle is supplied only when no BERT path is configured
    # on the class.
    if BERT_NER.PATH_TO_BERT is None:
        bert_hub_module_handle = 'https://tfhub.dev/google/bert_cased_L-12_H-768_A-12/1'
    else:
        bert_hub_module_handle = None
    recognizer = BERT_NER(
        finetune_bert=bert_will_be_tuned, batch_size=batch_size, l2_reg=l2,
        bert_hub_module_handle=bert_hub_module_handle,
        lstm_units=lstm_layer_size, max_epochs=max_epochs, patience=5,
        gpu_memory_frac=gpu_memory_frac, verbose=True, random_seed=42,
        lr=1e-6 if bert_will_be_tuned else 1e-4
    )
    recognizer.fit(X_train, y_train, validation_data=(X_val, y_val))

    # Persist the model before announcing it and before evaluation, so the
    # "saved" message is true and a failure while scoring cannot discard the
    # result of training.
    with open(model_name, 'wb') as fp:
        pickle.dump(recognizer, fp)
    print('')
    print('The NER has been successfully fitted and saved into the file `{0}`...'
          .format(model_name))

    y_pred = recognizer.predict(X_val)
    f1, precision, recall, quality_by_entities = calculate_prediction_quality(
        y_val, y_pred, classes_list=recognizer.classes_list_)
    print('All entities:')
    print(' F1-score is {0:.2%}.'.format(f1))
    print(' Precision is {0:.2%}.'.format(precision))
    print(' Recall is {0:.2%}.'.format(recall))
    for ne_type in sorted(quality_by_entities.keys()):
        # Each entry is indexed as (F1, precision, recall).
        ne_quality = quality_by_entities[ne_type]
        print(' {0}'.format(ne_type))
        print(' F1-score is {0:.2%}.'.format(ne_quality[0]))
        print(' Precision is {0:.2%}.'.format(ne_quality[1]))
        print(' Recall is {0:.2%}.'.format(ne_quality[2]))
    print('')
    return recognizer
def train(factrueval2016_devset_dir: str, split_by_paragraphs: bool,
          bert_will_be_tuned: bool, use_lang_features: bool, use_shapes: bool,
          lstm_layer_size: Union[int, None], l2: float, max_epochs: int,
          patience: int, batch_size: int, gpu_memory_frac: float,
          model_name: str, collection3_dir: Union[str, None] = None,
          n_max_samples: int = 0) -> BERT_NER:
    """Return a BERT_NER recognizer, loading it from disk or training it.

    If `model_name` is an existing file, the recognizer is unpickled from it.
    Otherwise the FactRuEval-2016 devset is converted to a temporary JSON file
    and loaded, a new recognizer is fitted and pickled into `model_name`.

    Training data selection:

    * `collection3_dir` is None and `n_max_samples` <= 0: fit on the whole
      FactRuEval-2016 set.
    * `collection3_dir` is None and `n_max_samples` > 0: split the
      FactRuEval-2016 set with `split_dataset`, sample `n_max_samples` items
      from the training part and validate on the held-out part.
    * `collection3_dir` is given: fit on Collection3 (optionally sampled down
      to `n_max_samples`) and use the FactRuEval-2016 set for validation.

    :param factrueval2016_devset_dir: directory passed to
        `factrueval2016_to_json` as the data source.
    :param split_by_paragraphs: forwarded to `factrueval2016_to_json`; when
        false, Collection3 samples are additionally divided by sentences.
    :param bert_will_be_tuned: passed to BERT_NER as `finetune_bert`; it also
        selects the learning rate (3e-6 when true, 1e-4 otherwise).
    :param use_lang_features: passed to BERT_NER as `use_nlp_features`.
    :param use_shapes: passed to BERT_NER as `use_shapes`.
    :param lstm_layer_size: passed to BERT_NER as `lstm_units`.
    :param l2: passed to BERT_NER as `l2_reg`.
    :param max_epochs: passed to BERT_NER as `max_epochs`.
    :param patience: passed to BERT_NER as `patience`.
    :param batch_size: passed to BERT_NER as `batch_size`.
    :param gpu_memory_frac: passed to BERT_NER as `gpu_memory_frac`.
    :param model_name: path of the pickle file to load from or save to.
    :param collection3_dir: optional directory with Collection3 data in the
        format read by `load_dataset_from_brat`.
    :param n_max_samples: if positive, the number of training samples
        requested from `sample_from_dataset`.
    :return: the loaded or newly fitted recognizer.
    :raises AssertionError: if the unpickled object is not a BERT_NER.
    """
    if os.path.isfile(model_name):
        # NOTE: unpickling can execute arbitrary code, so `model_name` must
        # come from a trusted source.
        with open(model_name, 'rb') as fp:
            recognizer = pickle.load(fp)
        assert isinstance(recognizer, BERT_NER)
        print('The NER has been successfully loaded from the file `{0}`...'.
              format(model_name))
        print('')
        return recognizer

    # A private temporary directory gives a path nobody else can pre-create
    # and is removed on exit (even on error), instead of borrowing the name
    # of a NamedTemporaryFile whose handle is never closed explicitly.
    with tempfile.TemporaryDirectory() as temp_dir:
        temp_json_name = os.path.join(temp_dir, 'factrueval2016.json')
        factrueval2016_to_json(factrueval2016_devset_dir, temp_json_name,
                               split_by_paragraphs)
        X, y = load_dataset_from_json(temp_json_name)
    print('The FactRuEval-2016 data for training have been loaded...')
    print('Number of samples is {0}.'.format(len(y)))
    print('')

    # The TF-Hub handle is supplied only when no BERT path is configured
    # on the class.
    if BERT_NER.PATH_TO_BERT is None:
        bert_hub_module_handle = 'https://tfhub.dev/google/bert_multi_cased_L-12_H-768_A-12/1'
    else:
        bert_hub_module_handle = None
    recognizer = BERT_NER(
        finetune_bert=bert_will_be_tuned, batch_size=batch_size, l2_reg=l2,
        bert_hub_module_handle=bert_hub_module_handle,
        lstm_units=lstm_layer_size, validation_fraction=0.25,
        max_epochs=max_epochs, patience=patience,
        gpu_memory_frac=gpu_memory_frac, verbose=True, random_seed=42,
        lr=3e-6 if bert_will_be_tuned else 1e-4, udpipe_lang='ru',
        use_nlp_features=use_lang_features, use_shapes=use_shapes
    )

    if collection3_dir is None:
        if n_max_samples > 0:
            train_index, test_index = split_dataset(
                y=y, test_part=recognizer.validation_fraction)
            # Object arrays allow selecting samples by index arrays.
            X_all = np.array(X, dtype=object)
            y_all = np.array(y, dtype=object)
            X_train, y_train = X_all[train_index], y_all[train_index]
            X_val, y_val = X_all[test_index], y_all[test_index]
            index = sample_from_dataset(y=y_train, n=n_max_samples)
            recognizer.fit(X_train[index], y_train[index],
                           validation_data=(X_val, y_val))
        else:
            recognizer.fit(X, y)
    else:
        X_train, y_train = load_dataset_from_brat(collection3_dir,
                                                  split_by_paragraphs=True)
        if not split_by_paragraphs:
            X_train, y_train = divide_dataset_by_sentences(
                X_train, y_train, sent_tokenize_func=ru_sent_tokenize)
        # Rename the short Collection3 labels to the long names; every other
        # label is kept as is. Keys are visited in sorted order so that, on a
        # name clash, the same entry wins as with sequential assignment.
        label_renames = {'PER': 'PERSON', 'LOC': 'LOCATION'}
        for sample_idx, sample in enumerate(y_train):
            y_train[sample_idx] = {
                label_renames.get(ne_type, ne_type): sample[ne_type]
                for ne_type in sorted(sample.keys())
            }
        print('The Collection3 data for training have been loaded...')
        print('Number of samples is {0}.'.format(len(y_train)))
        print('')
        if n_max_samples > 0:
            index = sample_from_dataset(y=y_train, n=n_max_samples)
            X_train = np.array(X_train, dtype=object)[index]
            y_train = np.array(y_train, dtype=object)[index]
        # The FactRuEval-2016 data serve as the validation set here.
        recognizer.fit(X_train, y_train, validation_data=(X, y))

    with open(model_name, 'wb') as fp:
        pickle.dump(recognizer, fp)
    print('')
    print('The NER has been successfully fitted and saved into the file `{0}`...'
          .format(model_name))
    print('')
    return recognizer