def test_fit_predict_dump_load(self):
    """Fits a model, tests that it predicts correctly, dumps and loads it, then tests that it still predicts"""
    model = Model(self.pipeline)

    # Test attempting to predict before fitting
    with self.assertRaises(RuntimeError):
        model.predict('Lorem ipsum dolor sit amet.')

    model.fit(self.dataset, groundtruth_directory=self.groundtruth_2_directory)

    # Test X and y data are set
    self.assertTrue(model.X_data)
    self.assertTrue(model.y_data)

    # Test that there is at least one prediction
    resulting_ann = model.predict(
        'To exclude the possibility that alterations in PSSD might be a consequence of changes in the volume of reference, we used a subset of the vibratome sections'
    )
    self.assertIsInstance(resulting_ann, Annotations)
    self.assertTrue(resulting_ann)

    # Test prediction over directory
    resulting_dataset = model.predict(
        self.dataset.data_directory, prediction_directory=self.prediction_directory)
    self.assertIsInstance(resulting_dataset, Dataset)
    self.assertEqual(len(self.dataset), len(resulting_dataset))

    # Test that groundtruth is written
    groundtruth_dataset = Dataset(self.groundtruth_2_directory)
    expected = [d.file_name for d in self.dataset]
    actual = [d.file_name for d in groundtruth_dataset]
    self.assertListEqual(expected, actual)

    # Test that the groundtruth ann files have content
    for ann in groundtruth_dataset.generate_annotations():
        self.assertTrue(ann)

    # Test pickling a model
    pickle_path = os.path.join(self.prediction_directory, 'test.pkl')
    model.dump(pickle_path)
    new_model = Model(self.pipeline)
    new_model.load(pickle_path)

    # Test that the loaded model still makes at least one prediction
    resulting_ann = new_model.predict(
        'To exclude the possibility that alterations in PSSD might be a consequence of changes in the volume of reference, we used a subset of the vibratome sections'
    )
    self.assertIsInstance(resulting_ann, Annotations)
    self.assertTrue(resulting_ann)
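# The tests in this file rely on fixtures such as self.pipeline, self.dataset,
# self.prediction_directory, and self.groundtruth_2_directory. A minimal sketch of
# what that setup might look like is given below; the entity labels, corpus path,
# and batch size are hypothetical placeholders (a tempfile import is assumed), not
# the suite's actual fixture code.
@classmethod
def setUpClass(cls):
    cls.entities = ['Drug', 'Strength']                 # hypothetical entity labels
    cls.dataset = Dataset('/path/to/annotated/corpus')  # hypothetical corpus path
    cls.pipeline = TestingPipeline(entities=cls.entities)
    cls.batch_size = 8                                  # hypothetical batch size for the BERT tests
    cls.prediction_directory = tempfile.mkdtemp()       # scratch directories for predictions
    cls.prediction_directory_2 = tempfile.mkdtemp()
    cls.groundtruth_2_directory = tempfile.mkdtemp()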
def test_predict(self):
    """
    predict() has different functionality depending on what is passed to it;
    therefore this test ensures that each type of input is handled correctly
    """
    # Init the Model
    pipe = TestingPipeline(entities=self.entities)
    sample_model_path = os.path.join(test_dir, 'sample_models', 'sample_test_pipe.pkl')
    model = Model(pipe)
    model.load(sample_model_path)

    # Test passing a Dataset
    dataset_output = model.predict(self.dataset)
    self.assertIsInstance(dataset_output, Dataset)
    self.assertEqual(len(dataset_output), len(self.dataset))

    # Test passing a directory
    directory_output = model.predict(self.dataset.data_directory)
    self.assertIsInstance(directory_output, Dataset)
    self.assertEqual(len(directory_output), len(self.dataset))

    # Test passing a string
    string_output = model.predict('This is a sample string.')
    self.assertIsInstance(string_output, Annotations)

    # Test that the predictions are written to the expected location when no path is provided
    expected_dir = os.path.join(self.dataset.data_directory, 'predictions')
    self.assertTrue(os.path.isdir(expected_dir))

    # Delete that directory
    shutil.rmtree(expected_dir)

    # Test predicting to a specific directory
    model.predict(self.dataset.data_directory, prediction_directory=self.prediction_directory_2)
    expected_files = os.listdir(self.prediction_directory_2)
    self.assertEqual(6, len(expected_files))
def test_prediction_with_testing_pipeline(self):
    """Tests that a model created with the BiLSTM+CRF pipeline can be fitted and used to predict"""
    pipeline = LstmSystematicReviewPipeline(
        entities=self.entities,
        word_embeddings=word_embeddings,
        cuda_device=cuda_device
    )

    model = Model(pipeline)
    model.fit(self.dataset)
    resulting_dataset = model.predict(
        self.dataset, prediction_directory=self.prediction_directory)

    self.assertIsInstance(resulting_dataset, Dataset)

    # Test that there is at least one prediction
    if not any(resulting_dataset.generate_annotations()):
        warn("The model did not generate any predictions")
def test_cross_validate_fit_predict(self):
    """Tests that a model created with BERT can be fitted and used to predict, with and without the CRF layer"""
    pipeline = BertPipeline(
        entities=self.entities,
        pretrained_model='bert-base-cased',
        batch_size=self.batch_size,
        cuda_device=cuda_device
    )

    pipeline_crf = BertPipeline(
        entities=self.entities,
        pretrained_model='bert-base-cased',
        batch_size=self.batch_size,
        cuda_device=cuda_device,
        using_crf=True
    )

    for pipe in [pipeline, pipeline_crf]:
        model = Model(pipe)
        model.cross_validate(self.dataset, 2)
        model.fit(self.dataset)
        resulting_dataset = model.predict(
            self.dataset, prediction_directory=self.prediction_directory)

        self.assertIsInstance(resulting_dataset, Dataset)

        # Test that there is at least one prediction
        if not any(resulting_dataset.generate_annotations()):
            warn("The model did not generate any predictions")
pipeline = SystematicReviewPipeline(entities=entities, use_metamap=True)

# n_jobs is the number of cores to utilize during feature extraction when training
# the model. Note: this is done by forking, not threading, and hence utilizes a
# large amount of memory.
model = Model(pipeline, n_jobs=1)

# Write information about the model before training
with open(os.path.join(model_directory, "model_information.txt"), 'w') as model_info:
    model_info.write("Entities: [%s]\n" % ", ".join(entities))
    model_info.write("Training Files: %i\n" % len(train_dataset.get_data_files()))
    model_info.write(model_notes + "\n")
    model_info.write(str(model))

model.fit(train_dataset)

# Dump the fitted model
current_time = datetime.datetime.fromtimestamp(time.time()).strftime('%Y_%m_%d_%H.%M.%S')
model.dump(os.path.join(model_directory, "tac_2018_%s_%s.pkl" % (model_name, current_time)))

# Predict over the datasets in evaluation_dataset utilizing the model trained above,
# then store those predictions in the given output directory
model.predict(evaluation_dataset,
              prediction_directory=os.path.join(model_directory, 'predictions'))

# Perform sequence-stratified cross validation over the trained model.
# Note that all extracted features are stored in memory while this runs.
model.cross_validate(training_dataset=train_dataset)
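# A minimal sketch of how the model dumped above might be reloaded for inference
# later. Model.load() and predict() are used exactly as in the tests above; the
# pickle path mirrors the dump() call, and the input sentence is illustrative only.
pipeline = SystematicReviewPipeline(entities=entities, use_metamap=True)
model = Model(pipeline, n_jobs=1)
model.load(os.path.join(model_directory, "tac_2018_%s_%s.pkl" % (model_name, current_time)))

# Predicting over a raw string returns an Annotations instance
annotations = model.predict('Patients were administered 20 mg of the study drug daily.')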