def test_process(self):
    """Check MetadataRepertoireFilter.process against a column threshold.

    Builds a dataset of three repertoires with a metadata file whose
    ``key2`` column holds 0, 1 and 2, then asserts that:

    * filtering on ``key2 > 1`` yields a dataset with exactly one example;
    * filtering on ``key2 > 10`` raises ``AssertionError`` (presumably
      because no repertoire is left - confirm against the filter).
    """
    path = EnvironmentSettings.root_path + "test/tmp/metadata_filter/"
    PathBuilder.build(path)

    def build_params(threshold):
        # Only the threshold differs between the two scenarios below.
        return {
            "criteria": {
                "type": OperationType.GREATER_THAN,
                "value": {
                    "type": DataType.COLUMN,
                    "name": "key2"
                },
                "threshold": threshold
            },
            "result_path": path
        }

    # try/finally so the temporary directory is removed even when an
    # assertion fails and does not leak into subsequent test runs.
    try:
        repertoires = RepertoireBuilder.build(
            [["ACF", "ACF", "ACF"], ["ACF", "ACF"], ["ACF", "ACF", "ACF", "ACF"]], path)[0]
        dataset = RepertoireDataset(repertoires=repertoires)

        metadata_file = path + "metadata.csv"
        pd.DataFrame(data={"key1": [0, 1, 2], "key2": [0, 1, 2]}).to_csv(metadata_file)
        dataset.metadata_file = metadata_file

        filtered = MetadataRepertoireFilter.process(dataset, build_params(1))
        self.assertEqual(1, filtered.get_example_count())

        with self.assertRaises(AssertionError):
            MetadataRepertoireFilter.process(dataset, build_params(10))
    finally:
        shutil.rmtree(path)
def test_run(self):
    """Check DataSplitter.run for the RANDOM, LOOCV and K_FOLD strategies.

    Uses a dataset of eight repertoires and asserts, per strategy:

    * RANDOM (training_percentage 0.7, 5 splits): 5 train and 5 test
      datasets, the first holding 5 and 3 examples respectively, and a
      second run with the same parameters gives the same repertoire ids
      in the first training split;
    * LOOCV: 8 splits, the first with 7 training and 1 test example;
    * K_FOLD (5 splits): 5 splits, the first with 6 training and 2 test
      examples.
    """
    dataset = RepertoireDataset(
        repertoires=[Repertoire("0.npy", "", str(i)) for i in range(8)])

    root = EnvironmentSettings.root_path + "test/tmp/datasplitter/"

    def build_split_paths(count):
        # Create one output directory per split and return their paths.
        split_paths = [root + "split_{}".format(i) for i in range(count)]
        for split_path in split_paths:
            PathBuilder.build(split_path)
        return split_paths

    def check_first_split(train_datasets, test_datasets, train_size, test_size):
        # Shared assertions on the type and size of the first split.
        self.assertIsInstance(train_datasets[0], RepertoireDataset)
        self.assertIsInstance(test_datasets[0], RepertoireDataset)
        self.assertEqual(len(train_datasets[0].get_data()), train_size)
        self.assertEqual(len(test_datasets[0].get_data()), test_size)

    # Building the split directories also creates `root`, which the
    # metadata file written below relies on.
    paths = build_split_paths(5)

    # try/finally so the temporary directory is removed even when an
    # assertion fails and stale splits cannot affect later runs.
    try:
        metadata_file = root + "metadata.csv"
        pd.DataFrame(data={
            "key1": [0, 0, 1, 1, 1, 2, 2, 0],
            "filename": [0, 1, 2, 3, 4, 5, 6, 7]
        }).to_csv(metadata_file)
        dataset.metadata_file = metadata_file

        # --- random split ---
        training_percentage = 0.7

        trains, tests = DataSplitter.run(
            DataSplitterParams(dataset=dataset,
                               training_percentage=training_percentage,
                               split_strategy=SplitType.RANDOM,
                               split_count=5,
                               paths=paths))

        check_first_split(trains, tests, 5, 3)
        self.assertEqual(5, len(trains))
        self.assertEqual(5, len(tests))
        self.assertEqual(5, len(trains[0].repertoires))

        # Same parameters a second time: the first training split must match.
        trains2, tests2 = DataSplitter.run(
            DataSplitterParams(dataset=dataset,
                               training_percentage=training_percentage,
                               split_strategy=SplitType.RANDOM,
                               split_count=5,
                               paths=paths))

        self.assertEqual(trains[0].get_repertoire_ids(), trains2[0].get_repertoire_ids())

        # --- leave-one-out: one split directory per example ---
        paths = build_split_paths(dataset.get_example_count())

        trains, tests = DataSplitter.run(
            DataSplitterParams(dataset=dataset,
                               split_strategy=SplitType.LOOCV,
                               split_count=-1,
                               training_percentage=-1,
                               paths=paths))

        check_first_split(trains, tests, 7, 1)
        self.assertEqual(8, len(trains))
        self.assertEqual(8, len(tests))

        # --- k-fold ---
        paths = build_split_paths(5)

        trains, tests = DataSplitter.run(
            DataSplitterParams(dataset=dataset,
                               split_strategy=SplitType.K_FOLD,
                               split_count=5,
                               training_percentage=-1,
                               paths=paths))

        check_first_split(trains, tests, 6, 2)
        self.assertEqual(5, len(trains))
        self.assertEqual(5, len(tests))
    finally:
        shutil.rmtree(root)