def test_split_dataset(self):
    """Check ``split_dataset``: split sizes partition the data per
    ``test_split``, target ids are unique within each split, and the
    train/test id sets are disjoint — for the deterministic split and
    for repeated random splits at two split ratios."""
    def split_tests(data_: TargetCollection, train_: TargetCollection,
                    test_: TargetCollection, test_split: float):
        # The two splits must partition the dataset according to test_split.
        data_size = len(data_)
        train_size = len(train_)
        test_size = len(test_)
        assert train_size == (data_size - int(data_size * test_split))
        assert test_size == int(data_size * test_split)
        assert data_size == (train_size + test_size)
        # Extract the numeric portion of each target id.
        # FIX: use raw strings for the regex — '\d' is an invalid string
        # escape and raises a SyntaxWarning on modern Python.
        train_ids = [re.findall(r'\d+', data['target_id'])[0]
                     for data in train_.data_dict()]
        test_ids = [re.findall(r'\d+', data['target_id'])[0]
                    for data in test_.data_dict()]
        # Ids are unique within each split.
        assert len(train_ids) == len(set(train_ids))
        assert len(test_ids) == len(set(test_ids))
        # And the two splits never share an id.
        for train_id in train_ids:
            assert train_id not in test_ids
        for test_id in test_ids:
            assert test_id not in train_ids
        return train_ids, test_ids

    test_dir = Path(__file__, '..', 'test_data')
    data_fp = Path(test_dir, 'semeval_test_data.xml')
    data = semeval_14(data_fp)
    # Deterministic (non-random) split: the held-out ids are stable.
    train, test = data.split_dataset(data, test_split=0.2)
    _, test_ids_ = split_tests(data, train, test, 0.2)
    true_test_ids = ['20140', '20141', '20120', '20121', '20110']
    assert true_test_ids == test_ids_
    # Tests many different random splits
    for i in range(20):
        train, test = data.split_dataset(data, test_split=0.2, random=True)
        split_tests(data, train, test, 0.2)
    # Tests many different random splits but with a much larger test size
    for i in range(20):
        train, test = data.split_dataset(data, test_split=0.8, random=True)
        split_tests(data, train, test, 0.8)
# NOTE(review): fragment of a dataset-preparation loop; the original newlines
# were stripped, and the enclosing headers are outside this view — the mid-line
# `else:` pairs with an unseen `if`/`elif`, and the trailing `continue` belongs
# to an unseen `for` over dataset names. The visible code appears to: truncate
# multi-span Dong targets to their first span, re-derive the target text from
# that span, round-trip the collection through a temporary SemEval-14 XML file,
# and (else branch) do the same round-trip for the other datasets, then assert
# every sentence keeps at least one target. Cannot be safely re-indented
# without the missing headers — left verbatim.
print(f'Number of targets before {len(dong_dataset)}') new_dong_dataset = [] for value in dong_dataset.data_dict(): target_spans = value['spans'] if len(target_spans) > 1: target_spans = [target_spans[0]] value['spans'] = target_spans value['target'] = value['text'][ target_spans[0][0]:target_spans[0][1]] new_dong_dataset.append(bella.data_types.Target(**value)) dong_dataset = bella.data_types.TargetCollection(new_dong_dataset) temp_fp = config.neural_dataset_dir / f'Temp Dong {split_name}.xml' write_data.semeval_14(temp_fp, dong_dataset) dataset = semeval_2014(temp_fp, conflict=False) else: another_dataset = parsers.semeval_14( dataset_fp_mapper[f'{dataset_name} {split_name}']) print(f'Number of targets before {len(another_dataset)}') temp_fp = config.neural_dataset_dir / f'Temp {dataset_name} {split_name}.xml' write_data.semeval_14(temp_fp, another_dataset) dataset = semeval_2014(temp_fp, conflict=False) assert dataset is not None # Just making sure each sentence contains at least one target. assert dataset.one_sample_per_span( remove_empty=True).number_targets() == dataset.number_targets() assert len(dataset) == len(dataset.samples_with_targets()) print(f'Number of targets with new format {dataset.number_targets()}') if split_name == 'test': dataset.to_json_file(dataset_fp) continue # For reproducibility reasons random_state = 42
from allennlp.training.trainer import Trainer, TensorboardWriter
from bella.parsers import semeval_14
import torch
import torch.optim as optim
from bella_allen_nlp.dataset_readers.target import TargetDatasetReader
from bella_allen_nlp.allen_models.target_lstm import TargetLSTMClassifier

logging.basicConfig(format='%(message)s', level=logging.INFO)

# Load the SemEval 2014 laptop/restaurant datasets from XML.
sem_dir = Path('..', 'aspect datasets', 'semeval_2014')
laptop_train = semeval_14(Path(sem_dir, 'laptop_train.xml'), name='Laptop')
rest_train = semeval_14(Path(sem_dir, 'restaurants_train.xml'),
                        name='Restaurant Train')
# FIX: the test collection previously re-loaded 'restaurants_train.xml'
# (copy-paste of the line above) while being named and saved as
# 'Restaurant Test' — confirm the exact test-set filename on disk.
rest_test = semeval_14(Path(sem_dir, 'restaurants_test.xml'),
                       name='Restaurant Test')

# Persist train/dev/test splits as JSON; fixed random_state for
# reproducible dev splits.
rest_train_fps = rest_train.to_json_file(['Restaurant Train', 'Restaurant Dev'],
                                         0.2, random_state=42)
rest_test_fp = rest_test.to_json_file('Restaurant Test')
laptop_fps = laptop_train.to_json_file(['Laptop Train', 'Laptop Dev'],
                                       0.2, random_state=42)
rest_train_fp, rest_dev_fp = rest_train_fps
laptop_train_fp, laptop_dev_fp = laptop_fps

# Single-id token indexer; lowercases tokens into the 'tokens_id' namespace.
token_indexers = {'tokens': SingleIdTokenIndexer(namespace='tokens_id',
                                                 lowercase_tokens=True)}
reader = TargetDatasetReader(token_indexers=token_indexers)
# NOTE(review): script fragment — the leading 'stores the data' string is the
# tail of a help-string assignment (likely `data_dir_help = '...' 'stores the
# data'`) split at the chunk boundary, so this cannot be re-indented safely.
# It parses CLI paths, loads the SemEval-14 and election datasets, then writes
# each test set to JSON under data_splits_dir.
# BUG(review): `election_train = election_train(election_dir, ...)` and
# `election_test = election_test(election_dir, ...)` rebind the parser
# *functions* to their results — it works exactly once, but shadows the
# callables for the rest of the module; prefer distinct names such as
# `election_train_data = election_train(...)` (and update later references,
# e.g. `all_test_data`).
'stores the data' data_splits_help = 'File path to the directory to store the dataset splits' parser.add_argument("data_dir", help=data_dir_help, type=parse_path) parser.add_argument("data_splits_dir", help=data_splits_help, type=parse_path) args = parser.parse_args() data_dir = args.data_dir rest_train_fp = Path(data_dir, 'Restaurants_Train_v2.xml') rest_test_fp = Path(data_dir, 'Restaurants_Test_Gold.xml') laptop_train_fp = Path(data_dir, 'Laptop_Train_v2.xml') laptop_test_fp = Path(data_dir, 'Laptops_Test_Gold.xml') election_dir = Path(data_dir, 'election') rest_train_data = semeval_14(rest_train_fp, name='Restaurant') rest_test_data = semeval_14(rest_test_fp, name='Restaurant Test') laptop_train_data = semeval_14(laptop_train_fp, name='Laptop') laptop_test_data = semeval_14(laptop_test_fp, name='Laptop Test') election_train = election_train(election_dir, name='Election') election_test = election_test(election_dir, name='Election Test') data_splits_dir: Path = args.data_splits_dir data_splits_dir.mkdir(parents=True, exist_ok=True) all_test_data = [rest_test_data, laptop_test_data, election_test] print('Test data') for test_data in all_test_data: dataset_name = test_data.name dataset_path = str(Path(data_splits_dir, dataset_name)) test_data.to_json_file(dataset_path, cache=False) print(f'{dataset_name}')
# NOTE(review): fragment — opens with the tail of a docstring whose `def`
# (apparently `get_targets_from_spans(dataset)`) is out of view, and ends
# mid-`for` loop (continued in another chunk). The visible helper body asserts
# each value has exactly one span and rebuilds `value['target']` from that
# span's slice of the text, returning a new TargetCollection. The script part
# then iterates dataset names preparing small training splits, dispatching on
# dataset name ('Election' vs 'Dong'). Left verbatim: re-indenting without the
# enclosing def/for headers would be guesswork.
words but is for methods that require capitalisation. ''' new_dataset = [] for value in dataset.data_dict(): target_spans = value['spans'] new_target = None text = value['text'] assert len(target_spans) == 1 for span in target_spans: new_target = text[span[0] : span[1]] assert new_target is not None value['target'] = new_target new_dataset.append(bella.data_types.Target(**value)) return bella.data_types.TargetCollection(new_dataset) size_of_small = len(parsers.semeval_14(config.youtubean_train)) for dataset_name in dataset_names: dataset_fp = config.small_training_dataset_dir / f'{dataset_name} train.xml' neural_train_dataset_fp = config.neural_small_dataset_dir / f'{dataset_name} train.json' neural_val_dataset_fp = config.neural_small_dataset_dir / f'{dataset_name} validation.json' if dataset_fp.exists(): continue dataset = None if dataset_name == 'Election': dataset = parsers.election_train(config.ELECTION, name='Election Train') dataset = get_targets_from_spans(dataset) elif dataset_name == 'Dong': dataset = parsers.dong(dataset_fp_mapper[f'{dataset_name} train']) new_dong_dataset = [] for value in dataset.data_dict():
# NOTE(review): pytest test class for AllenNLPModel. The original newlines
# were stripped, collapsing many statements (and inline `#` comments, which
# now swallow the rest of their physical line) onto single lines; statements
# also break across chunk boundaries (e.g. a trailing `with ` continued on
# the next line). Re-indenting is ambiguous in places (which statements sit
# inside the `pytest.raises` blocks in test_load, for example), so the code
# is left verbatim with per-chunk summaries.
#
# Chunk: class header, fixture paths/data loaded at class-definition time,
# test_repr_test, test_fitted, and test_probabilities (checks shape, rows
# summing to 1, and argmax matching the expected labels).
class TestAllenNLPModel(): test_dir = Path(__file__, '..', 'test_data') train_data_fp = Path(test_dir, 'data', 'target_collection_train_data.xml') test_data_fp = Path(test_dir, 'data', 'target_collection_test_data.xml') unseen_data_fp = Path(test_dir, 'data', 'unseen_data_for_predictions.xml') TARGET_DATA = semeval_14(test_data_fp, name='test data') TARGET_TRAIN_DATA = semeval_14(train_data_fp, name='train data') UNSEEN_DATA = semeval_14(unseen_data_fp, name='unseen') test_model_dir = Path(test_dir, 'model_configs') MODEL_TARGET_FP = Path(test_model_dir, 'test_target_model_config.json') MODEL_TDLSTM_FP = Path(test_model_dir, 'test_tdlstm_model_config.json') model_dir = Path(test_dir, 'saved_models') SAVED_TARGET_MODEL = Path(model_dir, 'target_model') SAVED_TDLSTM_MODEL = Path(model_dir, 'tdlstm_model') def test_repr_test(self): model = AllenNLPModel('ML', self.MODEL_TARGET_FP) model_repr = model.__repr__() assert model_repr == 'ML' def test_fitted(self): model = AllenNLPModel('ML', self.MODEL_TARGET_FP) assert not model.fitted model.fitted = True assert model.fitted @flaky(max_runs=5) @pytest.mark.parametrize("target_model", [(MODEL_TARGET_FP, SAVED_TARGET_MODEL), (MODEL_TDLSTM_FP, SAVED_TDLSTM_MODEL)]) def test_probabilities(self, target_model): data = self.UNSEEN_DATA model_config, model_path = target_model model = AllenNLPModel('ML', model_config, model_path.resolve()) model.load() labels = model.labels num_classes = len(labels) probabilities = model.probabilities(data) # Ensure the probabilities are probabilites assert probabilities.shape == (2, num_classes) np.testing.assert_almost_equal(probabilities.sum(1), np.array([1, 1])) correct_predictions = ['positive', 'negative'] for sample_index, probability in enumerate(probabilities): prediction_index = labels.index(correct_predictions[sample_index]) best_probability_index = np.argmax(probability) assert prediction_index == best_probability_index assert probability[best_probability_index] > (1/3) 
# Chunk: test_predict (one-hot prediction matrix), test_predict_label
# (optionally mapped label vector), and the start of test_predict_iter
# (predicting before load() must raise).
@flaky(max_runs=5) @pytest.mark.parametrize("target_model", [(MODEL_TARGET_FP, SAVED_TARGET_MODEL), (MODEL_TDLSTM_FP, SAVED_TDLSTM_MODEL)]) def test_predict(self, target_model): data = self.UNSEEN_DATA model_config, model_path = target_model model = AllenNLPModel('ML', model_config, model_path.resolve()) model.load() labels = model.labels predictions = model.predict(data) correct_predictions = ['positive', 'negative'] correct_predictions_matrix = np.zeros((2,3)) for sample_index, correct_prediction in enumerate(correct_predictions): prediction_index = labels.index(correct_prediction) correct_predictions_matrix[sample_index][prediction_index] = 1 assert np.array_equal(correct_predictions_matrix, predictions) @flaky(max_runs=5) @pytest.mark.parametrize("mapper", [None, {'positive': 1, 'neutral': 0, 'negative': -1}]) @pytest.mark.parametrize("target_model", [(MODEL_TARGET_FP, SAVED_TARGET_MODEL), (MODEL_TDLSTM_FP, SAVED_TDLSTM_MODEL)]) def test_predict_label(self, target_model, mapper): data = self.UNSEEN_DATA model_config, model_path = target_model model = AllenNLPModel('ML', model_config, model_path.resolve()) model.load() predictions = model.predict_label(data, mapper=mapper) correct_predictions = ['positive', 'negative'] if mapper: correct_predictions = [mapper[pred] for pred in correct_predictions] correct_predictions_vector = np.array(correct_predictions) assert np.array_equal(correct_predictions_vector, predictions) @flaky(max_runs=5) @pytest.mark.parametrize("target_model", [(MODEL_TARGET_FP, SAVED_TARGET_MODEL), (MODEL_TDLSTM_FP, SAVED_TDLSTM_MODEL)]) def test_predict_iter(self, target_model): data = self.UNSEEN_DATA model_config, model_path = target_model model = AllenNLPModel('ML', model_config, model_path.resolve()) with pytest.raises(Exception): model.predict(data) model.load() labels = model.labels true_classes = ['positive', 'negative', 'neutral'] num_classes = len(true_classes) assert len(labels) == num_classes predictions = model._predict_iter(data) 
# Chunk: rest of test_predict_iter (per-prediction key/value checks), test_fit
# (fit with/without test data; vocab token presence depends on test_data), and
# the start of test_load.
prediction_keys = ['class_probabilities', 'label'] prediction_results = [] for prediction in predictions: for key, value in prediction.items(): assert key in prediction_keys if key == 'class_probabilities': assert len(value) == num_classes if key == 'label': assert value in true_classes prediction_results.append(prediction) print('----------------') print(len(prediction_results)) print(prediction_results) print(len(data)) print(data) print('----------------') assert len(prediction_results) == len(data) @pytest.mark.parametrize("test_data", (True, False)) def test_fit(self, test_data): true_labels = sorted(['positive', 'negative', 'neutral']) with tempfile.TemporaryDirectory() as temp_dir: temp_save_dir_fp = Path(temp_dir, 'test save dir') model = AllenNLPModel('ML', self.MODEL_TARGET_FP, temp_save_dir_fp) assert not model.fitted assert not model.labels if test_data: model.fit(self.TARGET_TRAIN_DATA, self.TARGET_TRAIN_DATA, self.TARGET_DATA) else: model.fit(self.TARGET_TRAIN_DATA, self.TARGET_TRAIN_DATA) assert model.fitted assert true_labels == sorted(model.labels) assert temp_save_dir_fp.is_dir() token_index = model.model.vocab.get_token_to_index_vocabulary('tokens') if test_data: assert 'Tais' in list(token_index.keys()) else: assert 'Tais' not in list(token_index.keys()) @pytest.mark.parametrize("test_data", (True, False)) def test_load(self, test_data): true_labels = sorted(['positive', 'negative', 'neutral']) # Testing that an error is raised when there is no save directory model = AllenNLPModel('ML', self.MODEL_TARGET_FP) with pytest.raises(Exception): model.load() model.fit(self.TARGET_TRAIN_DATA, self.TARGET_TRAIN_DATA) assert model.fitted with pytest.raises(Exception): model.load() # Testing when the save directory is given with tempfile.TemporaryDirectory() as temp_dir: temp_save_dir_fp = Path(temp_dir, 'test save dir') model = AllenNLPModel('ML', self.MODEL_TARGET_FP, temp_save_dir_fp) with pytest.raises(FileNotFoundError): model.load() with 
# Chunk: rest of test_load (load after fit, load of a pre-saved model),
# test_set_random_seeds (seed keys injected and re-randomised), and the start
# of test_get_vocab's nested tokens_labels_exist helper.
pytest.raises(FileNotFoundError): temp_save_dir_fp.mkdir() model.load() temp_save_dir_fp.rmdir() assert not model.model assert not model.labels if test_data: model.fit(self.TARGET_TRAIN_DATA, self.TARGET_TRAIN_DATA, self.TARGET_DATA) else: model.fit(self.TARGET_TRAIN_DATA, self.TARGET_TRAIN_DATA) assert model.model assert true_labels == sorted(model.labels) archived_model = model.load() token_index = archived_model.vocab.get_token_to_index_vocabulary('tokens') assert len(token_index) > 10 if test_data: assert 'Tais' in list(token_index.keys()) else: assert 'Tais' not in list(token_index.keys()) # Testing when we have a model that is at the save directory without # having to fit the model first. model = AllenNLPModel('ML', self.MODEL_TARGET_FP, self.SAVED_TARGET_MODEL.resolve()) assert not model.model assert not model.labels model.load() assert model.model assert true_labels == sorted(model.labels) @flaky def test_set_random_seeds(self): model_params = Params.from_file(self.MODEL_TARGET_FP.resolve()) seed_keys = ["random_seed", "numpy_seed", "pytorch_seed"] for key in seed_keys: assert key not in model_params AllenNLPModel._set_random_seeds(model_params) seed_values = {} for key in seed_keys: assert key in model_params seed_values[key] = model_params[key] AllenNLPModel._set_random_seeds(model_params) for key in seed_keys: assert seed_values[key] != model_params[key] @pytest.mark.parametrize("lazy", (True, False)) def test_get_vocab(self, lazy): def tokens_labels_exist(dataset_reader, data_paths, test=False): true_labels = sorted(['positive', 'negative', 'neutral']) vocab = AllenNLPModel._get_vocab(dataset_reader, data_paths) tokens = list(vocab.get_token_to_index_vocabulary('tokens').keys()) labels = list(vocab.get_token_to_index_vocabulary('labels').keys()) labels = sorted(labels) assert true_labels == labels assert len(tokens) > 0 if test: assert 'Tais' in tokens assert 'Tha' in tokens else: assert 'Tais' not in tokens assert 'Tha' not in tokens with 
# Chunk: rest of test_get_vocab (writes temp JSON data, checks vocab with and
# without the test file), test_preprocess_and_load_param_file (dataset-path
# fields stripped), and test_add_dataset_paths (paths re-added, resolved).
tempfile.TemporaryDirectory() as temp_data_dir: train_fp = Path(temp_data_dir, 'train data') AllenNLPModel._data_to_json(self.TARGET_TRAIN_DATA, train_fp) val_fp = Path(temp_data_dir, 'val data') AllenNLPModel._data_to_json(self.TARGET_TRAIN_DATA, val_fp) test_fp = Path(temp_data_dir, 'test data') AllenNLPModel._data_to_json(self.TARGET_DATA, test_fp) dataset_reader = TargetDatasetReader(lazy=lazy) tokens_labels_exist(dataset_reader, [train_fp, val_fp], test=False) tokens_labels_exist(dataset_reader, [train_fp, val_fp, test_fp], test=True) def test_preprocess_and_load_param_file(self): fields_to_remove = ['train_data_path', 'validation_data_path', 'test_data_path', 'evaluate_on_test'] model_params = AllenNLPModel._preprocess_and_load_param_file(self.MODEL_TARGET_FP.resolve()) for field in fields_to_remove: assert field not in model_params model_params = Params.from_file(self.MODEL_TARGET_FP.resolve()) for field in fields_to_remove: assert field in model_params def test_add_dataset_paths(self): model_params = AllenNLPModel._preprocess_and_load_param_file(self.MODEL_TARGET_FP.resolve()) train_path = Path(self.test_data_fp, 'train_data') val_path = Path(self.test_data_fp, 'val_data') AllenNLPModel._add_dataset_paths(model_params, train_path, val_path) assert str(train_path.resolve()) == model_params['train_data_path'] assert str(val_path.resolve()) == model_params['validation_data_path'] with pytest.raises(KeyError): model_params['test_data_path'] model_params = AllenNLPModel._preprocess_and_load_param_file(self.MODEL_TARGET_FP.resolve()) test_path = Path(self.test_data_fp, 'test_data') AllenNLPModel._add_dataset_paths(model_params, train_path, val_path, test_path) assert str(train_path.resolve()) == model_params['train_data_path'] assert str(val_path.resolve()) == model_params['validation_data_path'] assert str(test_path.resolve()) == model_params['test_data_path'] @pytest.mark.parametrize("dataset_reader", [TargetDatasetReader(), TDLSTMDatasetReader()]) def 
# Chunk: test_data_to_json — round-trips TARGET_DATA through _data_to_json and
# checks the instances read back by the Target and TDLSTM dataset readers
# against hand-built expected token/target/sentiment structures.
test_data_to_json(self, dataset_reader: DatasetReader): model = AllenNLPModel('ML', self.MODEL_TARGET_FP) with tempfile.TemporaryDirectory() as temp_dir: temp_file = Path(temp_dir, 'temp_file.json') model._data_to_json(self.TARGET_DATA, temp_file) # Need to now read it with multiple dataset readers as in TDLSTM and target reader = dataset_reader instances = ensure_list(reader.read((temp_file))) text_1 = ['Tha', 'phone', 'came', 'with', 'a', 'very', 'good', 'battery', 'life', ',', 'however', 'the', 'phone', 'was', 'not', 'very', 'good', '.'] text_2 = ['Tais', 'was', 'an', 'ok', 'camera', 'but', 'the', 'lens', 'could', 'have', 'been', 'better', '.'] if isinstance(dataset_reader, TargetDatasetReader): test_instances = [{"text": text_1, "target": ["battery", "life"], "sentiment": 'positive'}, {"text": text_1, "target": ["phone"], "sentiment": 'negative'}, {"text": text_2, "target": ["camera"], "sentiment": 'neutral'}, {"text": text_2, "target": ["lens"], "sentiment": 'negative'}] elif isinstance(dataset_reader, TDLSTMDatasetReader): left_text_1_1 = text_1[:9] right_text_1_1 = text_1[7:] test_instance_1_1 = {"left_text": left_text_1_1, "right_text": right_text_1_1, "target": ["battery", "life"], "sentiment": 'positive'} left_text_1_2 = text_1[:13] right_text_1_2 = text_1[12:] test_instance_1_2 = {"left_text": left_text_1_2, "right_text": right_text_1_2, "target": ["phone"], "sentiment": 'negative'} left_text_2_1 = text_2[:5] right_text_2_1 = text_2[4:] test_instance_2_1 = {"left_text": left_text_2_1, "right_text": right_text_2_1, "target": ["camera"], "sentiment": 'neutral'} left_text_2_2 = text_2[:8] right_text_2_2 = text_2[7:] test_instance_2_2 = {"left_text": left_text_2_2, "right_text": right_text_2_2, "target": ["lens"], "sentiment": 'negative'} test_instances = [test_instance_1_1, test_instance_1_2, test_instance_2_1, test_instance_2_2] assert len(test_instances) == len(instances) text_keys = ['text', 'target', 'left_text', 'right_text'] for index, instance in 
# Chunk: tail of test_data_to_json — compares each read instance's fields to
# the expected values (right_text is reversed before comparison, matching the
# TDLSTM reader's reversed right context).
enumerate(instances): instance = instance.fields test_instance = test_instances[index] for key, value in test_instance.items(): if key in text_keys: value_other = [token.text for token in instance[key].tokens] elif key == 'sentiment': value_other = instance['label'].label if key == 'right_text': value_other.reverse() assert value == value_other