def test_negative01(self): y = [ {"LOCATION": [(55, 63), (66, 84), (87, 93)], "PERSON": [(281, 289)]} ] true_err_msg = re.escape('There are too few samples in the data set! Minimal number of samples is 2.') with self.assertRaisesRegex(ValueError, true_err_msg): _, _ = split_dataset(y, 0.3333, n_restarts=4)
def test_negative02(self):
    X = np.array(
        ['01abc', '02def', '03ghi', '04jkl', '05mno', '06pqr',
         '07stu', '08vwx', '09yza', '10bcd', '11efg', '12hij'],
        dtype=str
    )
    y_tokenized = np.array(
        [
            [0, 0, 2, 1, 1, 0, 0, 0, 2, 0, 4, 3, 0],
            [2, 0, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
            [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
            [0, 0, 0, 4, 0, 0, 4, 3, 0, 0, 0, 0, 0],
            [4, 3, 0, 4, 3, 3, 0, 0, 0, 0, 0, 2, 1],
            [0, 0, 0, 2, 1, 0, 0, 0, 4, 0, 0, 0, 0],
            [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
            [0, 0, 0, 6, 5, 5, 0, 0, 0, 0, 0, 0, 0],
            [0, 0, 0, 0, 0, 0, 6, 5, 4, 3, 0, 0, 0],
            [0, 0, 0, 4, 4, 0, 0, 0, 0, 0, 2, 0, 0],
            [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 3, 0],
            [0, 0, 0, 2, 0, 0, 0, 0, 4, 0, 2, 1, 1],
        ],
        dtype=np.int32
    )
    true_err_msg = re.escape(
        '{0} is too small value of the test part! '
        'There are no samples for testing subset!'.format(0.01)
    )
    with self.assertRaisesRegex(ValueError, true_err_msg):
        _, _ = split_dataset(X, y_tokenized, 0.01, n_restarts=4, random_seed=0)
def test_negative04(self):
    X = np.array(
        ['01abc', '02def', '03ghi', '04jkl', '05mno', '06pqr',
         '07stu', '08vwx', '09yza', '10bcd', '11efg', '12hij'],
        dtype=str
    )
    y_tokenized = np.array(
        [
            [0, 0, 2, 1, 1, 0, 0, 0, 2, 0, 4, 3, 0],
            [2, 0, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
            [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
            [0, 0, 0, 4, 0, 0, 4, 3, 0, 0, 0, 0, 0],
            [4, 3, 0, 4, 3, 3, 0, 0, 0, 0, 0, 2, 1],
            [0, 0, 0, 2, 1, 0, 0, 0, 4, 0, 0, 0, 0],
            [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
            [0, 0, 0, 6, 5, 5, 0, 0, 0, 0, 0, 0, 0],
            [0, 0, 0, 0, 0, 0, 6, 5, 4, 3, 0, 0, 0],
            [0, 0, 0, 4, 4, 0, 0, 0, 0, 0, 2, 0, 0],
            [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 3, 0],
            [0, 0, 0, 2, 0, 0, 0, 0, 4, 0, 2, 1, 1],
        ],
        dtype=np.int32
    )
    true_err_msg = re.escape(
        '1 is too small value of restarts number. It must be greater than 1.'
    )
    with self.assertRaisesRegex(ValueError, true_err_msg):
        _, _ = split_dataset(X, y_tokenized, 0.3333, n_restarts=1, random_seed=0)
def test_negative04(self):
    y = [
        {"LOCATION": [(55, 63), (66, 84), (87, 93)], "PERSON": [(281, 289)]},
        {"PERSON": [(33, 44)], "LOCATION": [(198, 204), (189, 197), (168, 185)],
         "ORG": [(230, 249)]},
        {"PERSON": [(87, 98)], "ORG": [(18, 42), (18, 56)]},
        {"LOCATION": [(151, 157)], "PERSON": [(130, 140)]}
    ]
    true_err_msg = re.escape('1 is too small value of restarts number. '
                             'It must be greater than 1.')
    with self.assertRaisesRegex(ValueError, true_err_msg):
        _, _ = split_dataset(y, 0.3333, n_restarts=1)
def test_negative03(self):
    y = [
        {"LOCATION": [(55, 63), (66, 84), (87, 93)], "PERSON": [(281, 289)]},
        {"PERSON": [(33, 44)], "LOCATION": [(198, 204), (189, 197), (168, 185)],
         "ORG": [(230, 249)]},
        {"PERSON": [(87, 98)], "ORG": [(18, 42), (18, 56)]},
        {"LOCATION": [(151, 157)], "PERSON": [(130, 140)]}
    ]
    true_err_msg = re.escape('{0} is too large value of the test part! '
                             'There are no samples for training subset!'.format(0.99))
    with self.assertRaisesRegex(ValueError, true_err_msg):
        _, _ = split_dataset(y, 0.99, n_restarts=4)
def test_negative01(self):
    X = np.array(['01abc'], dtype=str)
    y_tokenized = np.array(
        [
            [0, 0, 2, 1, 1, 0, 0, 0, 2, 0, 4, 3, 0],
        ],
        dtype=np.int32
    )
    true_err_msg = re.escape(
        'There are too few samples in the data set! Minimal number of samples is 2.'
    )
    with self.assertRaisesRegex(ValueError, true_err_msg):
        _, _ = split_dataset(X, y_tokenized, 0.3333, n_restarts=4, random_seed=0)
def test_positive01(self):
    base_dir = os.path.join(os.path.dirname(__file__), 'testdata')
    _, y = load_dataset_from_json(os.path.join(base_dir, 'true_named_entities.json'))
    train_index, test_index = split_dataset(y, 0.3, 10)
    self.assertIsInstance(train_index, np.ndarray)
    self.assertIsInstance(test_index, np.ndarray)
    self.assertEqual(len(y), len(train_index) + len(test_index))
    self.assertEqual(len(train_index), len(set(train_index.tolist())))
    self.assertEqual(len(test_index), len(set(test_index.tolist())))
    self.assertEqual(0, len(set(train_index.tolist()) & set(test_index.tolist())))
    true_set_of_classes = {'ORG', 'PERSON', 'LOCATION'}
    set_of_classes_for_training = set()
    for idx in train_index:
        set_of_classes_for_training |= set(y[idx].keys())
    set_of_classes_for_testing = set()
    for idx in test_index:
        set_of_classes_for_testing |= set(y[idx].keys())
    self.assertEqual(set_of_classes_for_training, set_of_classes_for_testing)
    self.assertEqual(true_set_of_classes, set_of_classes_for_training)
    self.assertEqual(true_set_of_classes, set_of_classes_for_testing)
def test_positive01(self):
    X = np.array(
        ['01abc', '02def', '03ghi', '04jkl', '05mno', '06pqr',
         '07stu', '08vwx', '09yza', '10bcd', '11efg', '12hij'],
        dtype=str
    )
    y_tokenized = np.array(
        [
            [0, 0, 2, 1, 1, 0, 0, 0, 2, 0, 4, 3, 0],  # 0 1 2 3 4  # 0 2 3 4
            [0, 0, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],  # 0 2 6  # 0
            [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],  # 0  # 0
            [0, 0, 0, 4, 0, 0, 4, 3, 0, 0, 0, 0, 0],  # 0 3 4  # 0 3 4
            [4, 3, 0, 4, 3, 3, 0, 0, 0, 0, 0, 2, 1],  # 0 1 2 3 4  # 0 2 3 4
            [0, 0, 0, 2, 1, 0, 0, 0, 4, 0, 0, 0, 0],  # 0 1 2 4  # 0 2 4
            [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],  # 0  # 0
            [0, 0, 0, 6, 5, 5, 0, 0, 0, 0, 0, 0, 0],  # 0 5 6  # 0
            [0, 0, 0, 0, 0, 0, 6, 5, 4, 3, 0, 0, 0],  # 0 3 4 5 6  # 0 3 4
            [0, 0, 0, 4, 4, 0, 0, 0, 0, 0, 2, 0, 0],  # 0 2 4  # 0 2 4
            [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 3, 0],  # 0 3 4  # 0 3 4
            [0, 0, 0, 2, 0, 0, 0, 0, 4, 0, 2, 1, 1],  # 0 1 2 4  # 0 2 4
        ],
        dtype=np.int32
    )  # 0: 120  # 1: 6  # 2: 7  # 3: 7  # 4: 11  # 5: 3  # 6: 3
    true_indices_for_training = np.array([1, 4, 5, 6, 7, 8, 10, 11], dtype=np.int32)
    true_indices_for_testing = np.array([0, 2, 3, 9], dtype=np.int32)
    calc_indices_for_training, calc_indices_for_testing = split_dataset(
        X, y_tokenized, 0.3333, n_restarts=4, random_seed=0)
    self.assertIsInstance(calc_indices_for_training, np.ndarray)
    self.assertIsInstance(calc_indices_for_testing, np.ndarray)
    self.assertEqual(true_indices_for_training.tolist(),
                     calc_indices_for_training.tolist())
    self.assertEqual(true_indices_for_testing.tolist(),
                     calc_indices_for_testing.tolist())
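# The tests above assume two call signatures for `split_dataset` (the function
# itself is not shown in this excerpt): an annotation-dict form,
# split_dataset(y, test_part, n_restarts), and a tokenized form,
# split_dataset(X, y_tokenized, test_part, n_restarts, random_seed). A minimal
# usage sketch of the annotation-dict form, with made-up entity spans:
#
#     y = [{'PERSON': [(0, 5)]}, {'LOCATION': [(7, 12)]},
#          {'PERSON': [(3, 9)]}, {'LOCATION': [(1, 4)]}]
#     train_index, test_index = split_dataset(y, 0.3333, n_restarts=4)
#     # Both results are np.ndarray objects holding disjoint sample indices
#     # that together cover range(len(y)), with each entity class represented
#     # in both the training and the testing subset.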
def train(factrueval2016_devset_dir: str, split_by_paragraphs: bool, bert_will_be_tuned: bool,
          use_lang_features: bool, use_shapes: bool, lstm_layer_size: Union[int, None],
          l2: float, max_epochs: int, patience: int, batch_size: int, gpu_memory_frac: float,
          model_name: str, collection3_dir: Union[str, None] = None,
          n_max_samples: int = 0) -> BERT_NER:
    if os.path.isfile(model_name):
        with open(model_name, 'rb') as fp:
            recognizer = pickle.load(fp)
        assert isinstance(recognizer, BERT_NER)
        print('The NER has been successfully loaded from the file `{0}`...'.format(model_name))
        print('')
    else:
        temp_json_name = tempfile.NamedTemporaryFile(mode='w').name
        try:
            factrueval2016_to_json(factrueval2016_devset_dir, temp_json_name, split_by_paragraphs)
            X, y = load_dataset_from_json(temp_json_name)
        finally:
            if os.path.isfile(temp_json_name):
                os.remove(temp_json_name)
        print('The FactRuEval-2016 data for training have been loaded...')
        print('Number of samples is {0}.'.format(len(y)))
        print('')
        if BERT_NER.PATH_TO_BERT is None:
            bert_hub_module_handle = 'https://tfhub.dev/google/bert_multi_cased_L-12_H-768_A-12/1'
        else:
            bert_hub_module_handle = None
        recognizer = BERT_NER(
            finetune_bert=bert_will_be_tuned, batch_size=batch_size, l2_reg=l2,
            bert_hub_module_handle=bert_hub_module_handle, lstm_units=lstm_layer_size,
            validation_fraction=0.25, max_epochs=max_epochs, patience=patience,
            gpu_memory_frac=gpu_memory_frac, verbose=True, random_seed=42,
            lr=3e-6 if bert_will_be_tuned else 1e-4, udpipe_lang='ru',
            use_nlp_features=use_lang_features, use_shapes=use_shapes
        )
        if collection3_dir is None:
            if n_max_samples > 0:
                # Split off a validation subset, then subsample the training part.
                train_index, test_index = split_dataset(y=y, test_part=recognizer.validation_fraction)
                X_train = np.array(X, dtype=object)[train_index]
                y_train = np.array(y, dtype=object)[train_index]
                X_val = np.array(X, dtype=object)[test_index]
                y_val = np.array(y, dtype=object)[test_index]
                del train_index, test_index
                index = sample_from_dataset(y=y_train, n=n_max_samples)
                recognizer.fit(X_train[index], y_train[index], validation_data=(X_val, y_val))
            else:
                recognizer.fit(X, y)
        else:
            X_train, y_train = load_dataset_from_brat(collection3_dir, split_by_paragraphs=True)
            if not split_by_paragraphs:
                X_train, y_train = divide_dataset_by_sentences(
                    X_train, y_train, sent_tokenize_func=ru_sent_tokenize)
            # Rename the Collection3 tags PER and LOC to the PERSON and LOCATION
            # tags used by the FactRuEval-2016 annotation.
            for sample_idx in range(len(y_train)):
                new_y_sample = dict()
                for ne_type in sorted(list(y_train[sample_idx].keys())):
                    if ne_type == 'PER':
                        new_y_sample['PERSON'] = y_train[sample_idx][ne_type]
                    elif ne_type == 'LOC':
                        new_y_sample['LOCATION'] = y_train[sample_idx][ne_type]
                    else:
                        new_y_sample[ne_type] = y_train[sample_idx][ne_type]
                y_train[sample_idx] = new_y_sample
                del new_y_sample
            print('The Collection3 data for training have been loaded...')
            print('Number of samples is {0}.'.format(len(y_train)))
            print('')
            if n_max_samples > 0:
                index = sample_from_dataset(y=y_train, n=n_max_samples)
                X_train = np.array(X_train, dtype=object)[index]
                y_train = np.array(y_train, dtype=object)[index]
                del index
            recognizer.fit(X_train, y_train, validation_data=(X, y))
        with open(model_name, 'wb') as fp:
            pickle.dump(recognizer, fp)
        print('')
        print('The NER has been successfully fitted and saved into the file '
              '`{0}`...'.format(model_name))
        print('')
    return recognizer
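# An illustrative call of the BERT-based `train` function above; the path and
# hyperparameter values below are hypothetical placeholders, not values taken
# from the original script:
#
#     recognizer = train(
#         factrueval2016_devset_dir='data/factRuEval-2016/devset',
#         split_by_paragraphs=True, bert_will_be_tuned=False,
#         use_lang_features=False, use_shapes=False, lstm_layer_size=256,
#         l2=1e-2, max_epochs=10, patience=3, batch_size=16,
#         gpu_memory_frac=0.9, model_name='bert_ner.pkl'
#     )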
def train(factrueval2016_devset_dir: str, split_by_paragraphs: bool, elmo_will_be_tuned: bool,
          use_lang_features: bool, use_shapes: bool, max_epochs: int, patience: int,
          batch_size: int, lr: float, l2: float, gpu_memory_frac: float, model_name: str,
          collection3_dir: Union[str, None] = None, n_max_samples: int = 0) -> ELMo_NER:
    if os.path.isfile(model_name):
        with open(model_name, 'rb') as fp:
            recognizer = pickle.load(fp)
        assert isinstance(recognizer, ELMo_NER)
        print('The NER has been successfully loaded from the file `{0}`...'.format(model_name))
        print('')
    else:
        temp_json_name = tempfile.NamedTemporaryFile(mode='w').name
        try:
            factrueval2016_to_json(factrueval2016_devset_dir, temp_json_name, split_by_paragraphs)
            X, y = load_dataset_from_json(temp_json_name)
        finally:
            if os.path.isfile(temp_json_name):
                os.remove(temp_json_name)
        print('The FactRuEval-2016 data for training have been loaded...')
        print('Number of samples is {0}.'.format(len(y)))
        print('')
        # Find the length (in tokens) of the longest training text.
        max_number_of_tokens = 0
        pipeline = create_udpipe_pipeline('ru')
        for cur in X:
            spacy_doc = pipeline(cur)
            n_tokens = 0
            for _ in spacy_doc:
                n_tokens += 1
            del spacy_doc
            if n_tokens > max_number_of_tokens:
                max_number_of_tokens = n_tokens
        del pipeline
        print('Maximal number of tokens is {0}.'.format(max_number_of_tokens))
        # Round the maximal token count up to the next power of two and use it
        # as the maximal sequence length.
        n_tokens = 2
        while n_tokens < max_number_of_tokens:
            n_tokens *= 2
        elmo_hub_module_handle = ('http://files.deeppavlov.ai/deeppavlov_data/'
                                  'elmo_ru-news_wmt11-16_1.5M_steps.tar.gz')
        recognizer = ELMo_NER(
            finetune_elmo=elmo_will_be_tuned, batch_size=batch_size, l2_reg=l2,
            max_seq_length=n_tokens, elmo_hub_module_handle=elmo_hub_module_handle,
            validation_fraction=0.25, max_epochs=max_epochs, patience=patience,
            gpu_memory_frac=gpu_memory_frac, verbose=True, random_seed=42, lr=lr,
            udpipe_lang='ru', use_nlp_features=use_lang_features, use_shapes=use_shapes
        )
        if collection3_dir is None:
            if n_max_samples > 0:
                # Split off a validation subset, then subsample the training part.
                train_index, test_index = split_dataset(y=y, test_part=recognizer.validation_fraction)
                X_train = np.array(X, dtype=object)[train_index]
                y_train = np.array(y, dtype=object)[train_index]
                X_val = np.array(X, dtype=object)[test_index]
                y_val = np.array(y, dtype=object)[test_index]
                del train_index, test_index
                index = sample_from_dataset(y=y_train, n=n_max_samples)
                recognizer.fit(X_train[index], y_train[index], validation_data=(X_val, y_val))
            else:
                recognizer.fit(X, y)
        else:
            X_train, y_train = load_dataset_from_brat(collection3_dir, split_by_paragraphs=True)
            if not split_by_paragraphs:
                X_train, y_train = divide_dataset_by_sentences(
                    X_train, y_train, sent_tokenize_func=ru_sent_tokenize)
            # Rename the Collection3 tags PER and LOC to the PERSON and LOCATION
            # tags used by the FactRuEval-2016 annotation.
            for sample_idx in range(len(y_train)):
                new_y_sample = dict()
                for ne_type in sorted(list(y_train[sample_idx].keys())):
                    if ne_type == 'PER':
                        new_y_sample['PERSON'] = y_train[sample_idx][ne_type]
                    elif ne_type == 'LOC':
                        new_y_sample['LOCATION'] = y_train[sample_idx][ne_type]
                    else:
                        new_y_sample[ne_type] = y_train[sample_idx][ne_type]
                y_train[sample_idx] = new_y_sample
                del new_y_sample
            print('The Collection3 data for training have been loaded...')
            print('Number of samples is {0}.'.format(len(y_train)))
            print('')
            if n_max_samples > 0:
                index = sample_from_dataset(y=y_train, n=n_max_samples)
                X_train = np.array(X_train, dtype=object)[index]
                y_train = np.array(y_train, dtype=object)[index]
                del index
            recognizer.fit(X_train, y_train, validation_data=(X, y))
        with open(model_name, 'wb') as fp:
            pickle.dump(recognizer, fp)
        print('')
        print('The NER has been successfully fitted and saved into the file '
              '`{0}`...'.format(model_name))
        print('')
    return recognizer
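# A minimal standalone sketch of the sequence-length rounding performed inside
# the ELMo `train` function above: the maximal token count is rounded up to the
# next power of two (but never below 2) before being passed to ELMo_NER as
# max_seq_length. The helper name `round_up_to_power_of_two` is ours and does
# not appear in the original scripts.
def round_up_to_power_of_two(n: int) -> int:
    result = 2  # the sequence length starts from 2, as in `train` above
    while result < n:
        result *= 2
    return result


# For example, a longest text of 37 tokens yields a sequence length of 64:
# round_up_to_power_of_two(37) == 64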