def test_by_regex():
    """Check that the FilterByRegex stage drops exactly one image file.

    Loads ``tests/data/image`` recursively, configures the single pipeline
    component with the pattern ``sample.jpg`` and asserts that the file
    count is one lower after the pipeline has run.
    """
    group = FileGroup(recursive=True)
    group.load_dir("tests/data/image", File, recursive=True)
    group.configure_pipeline(["FilterByRegex"])

    # NOTE(review): the "." in the pattern is an unescaped regex
    # metacharacter; presumably harmless for this fixture set -- confirm.
    regex_filter = group.pipeline.components[0]
    regex_filter.configure(pattern="sample.jpg")

    count_before = len(group.files)
    group.run_pipeline()
    count_after = len(group.files)

    assert count_after == count_before - 1
def test_subsample():
    """Check that the FilterSubsample stage leaves exactly ``N`` files.

    The fixture directory must hold more than two files to begin with,
    otherwise asking for ``N=2`` would not demonstrate any reduction.
    """
    group = FileGroup(recursive=True)
    group.load_dir("tests/data/image", File, recursive=True)
    group.configure_pipeline(["FilterSubsample"])

    # Precondition: there is something to subsample away.
    assert len(group.files) > 2

    subsampler = group.pipeline.components[0]
    subsampler.configure(N=2)
    group.run_pipeline()

    assert len(group.files) == 2
def test_by_label():
    """Check that the FilterByLabel stage drops the one labelled file.

    Only the first loaded file is tagged ``TRAIN``; after running the
    pipeline configured with ``label_to_filter="TRAIN"`` the file count
    is expected to be one lower.
    """
    group = FileGroup(recursive=True)
    group.load_dir("tests/data/image", File, recursive=True)
    group.configure_pipeline(["FilterByLabel"])

    label_filter = group.pipeline.components[0]
    label_filter.configure(label_to_filter="TRAIN")

    # Tag a single file so exactly one entry carries the filtered label.
    first_file = group.files[0]
    first_file.add_label("TRAIN")

    count_before = len(group.files)
    group.run_pipeline()

    assert len(group.files) == count_before - 1
def test_validation_and_test():
    """Check the TRAIN / VALIDATE / TEST split made by the labeler stage.

    With ``val_frac=0.2`` and ``test_frac=0.2`` the VALIDATE and TEST
    groups are each expected to hold ``round(total * 0.2)`` files, and
    TRAIN is expected to hold the remainder.
    """
    group = FileGroup(recursive=True)
    group.load_dir("tests/data", File, recursive=True)
    group.configure_pipeline(["LabelerValidationAndTest"])

    labeler = group.pipeline.components[0]
    labeler.configure(val_frac=0.2, test_frac=0.2)
    group.run_pipeline()

    # Each file is counted under the first label it carries, checked in
    # the order TRAIN, VALIDATE, TEST; files with none are not counted.
    tally = {"TRAIN": 0, "VALIDATE": 0, "TEST": 0}
    for entry in group.files:
        for label in tally:
            if entry.has_label(label):
                tally[label] += 1
                break

    total = len(group.files)
    held_out = round(total * 0.2)

    assert tally["TRAIN"] == total - 2 * held_out
    assert tally["VALIDATE"] == held_out
    assert tally["TEST"] == held_out
def test_duplicates():
    """Check that the FilterDuplicateFiles stage reports duplicates.

    After the pipeline has run on ``tests/data/image``, the
    ``"duplicates"`` entry of ``filtered`` must be non-empty.
    """
    group = FileGroup(recursive=True)
    group.load_dir("tests/data/image", File, recursive=True)
    group.configure_pipeline(["FilterDuplicateFiles"])
    group.run_pipeline()

    duplicates = group.filtered["duplicates"]
    assert len(duplicates) > 0