def test_cant_merge_different_categories(self):
    """Merging sources whose label categories disagree must raise."""
    class TestExtractor1(Extractor):
        def __iter__(self):
            return iter([])

        def categories(self):
            return {
                AnnotationType.label:
                    LabelCategories.from_iterable(['a', 'b']),
            }

    class TestExtractor2(Extractor):
        def __iter__(self):
            return iter([])

        def categories(self):
            return {
                AnnotationType.label:
                    LabelCategories.from_iterable(['b', 'a']),
            }

    project = Project()
    for name, extractor in (('e1', TestExtractor1), ('e2', TestExtractor2)):
        project.env.extractors.register(name, extractor)
    project.add_source('source1', {'format': 'e1'})
    project.add_source('source2', {'format': 'e2'})

    # Label order differs between the sources, so the merge must fail.
    with self.assertRaisesRegex(Exception, "different categories"):
        project.make_dataset()
def test_can_do_transform_with_custom_model(self):
    """End-to-end check: a dataset can be run through a registered
    custom model (launcher), and the predictions can be saved by a
    custom converter and read back by a custom extractor.
    """
    # Source: two items whose images are filled with 0s and 1s.
    class TestExtractorSrc(Extractor):
        def __iter__(self):
            for i in range(2):
                yield DatasetItem(id=i, image=np.ones([2, 2, 3]) * i,
                    annotations=[Label(i)])

    # "Model": predicts the label from the top-left pixel value.
    class TestLauncher(Launcher):
        def launch(self, inputs):
            for inp in inputs:
                yield [Label(inp[0, 0, 0])]

    # Saves each item's label into a '<id>.txt' file.
    class TestConverter(Converter):
        def __call__(self, extractor, save_dir):
            for item in extractor:
                with open(osp.join(save_dir, '%s.txt' % item.id), 'w') as f:
                    f.write(str(item.annotations[0].label) + '\n')

    # Reads the files written by TestConverter back into items.
    class TestExtractorDst(Extractor):
        def __init__(self, url):
            super().__init__()
            self.items = [osp.join(url, p) for p in sorted(os.listdir(url))]

        def __iter__(self):
            for path in self.items:
                with open(path, 'r') as f:
                    index = osp.splitext(osp.basename(path))[0]
                    label = int(f.readline().strip())
                    yield DatasetItem(id=index, annotations=[Label(label)])

    model_name = 'model'
    launcher_name = 'custom_launcher'
    extractor_name = 'custom_extractor'

    project = Project()
    project.env.launchers.register(launcher_name, TestLauncher)
    project.env.extractors.register(extractor_name, TestExtractorSrc)
    project.env.converters.register(extractor_name, TestConverter)
    project.add_model(model_name, {'launcher': launcher_name})
    project.add_source('source', {'format': extractor_name})

    with TestDir() as test_dir:
        # Run inference and persist the results via the converter.
        project.make_dataset().apply_model(model=model_name,
            save_dir=test_dir)

        # Reload the saved results with the destination extractor.
        result = Project.load(test_dir)
        result.env.extractors.register(extractor_name, TestExtractorDst)
        it = iter(result.make_dataset())
        item1 = next(it)
        item2 = next(it)

        # Labels should match the source image fill values (0 and 1).
        self.assertEqual(0, item1.annotations[0].label)
        self.assertEqual(1, item2.annotations[0].label)
def test_custom_extractor_can_be_created(self):
    """A registered custom extractor works as a project source, and
    subset selection keeps only the requested subset."""
    class CustomExtractor(Extractor):
        def __init__(self, url):
            super().__init__()

        def __iter__(self):
            items = [
                DatasetItem(id=0, subset='train'),
                DatasetItem(id=1, subset='train'),
                DatasetItem(id=2, subset='train'),
                DatasetItem(id=3, subset='test'),
            ]
            return iter(items)

        def subsets(self):
            return ['train', 'test']

    project = Project()
    project.env.extractors.register('ext1', CustomExtractor)
    project.add_source('src1', {
        'url': 'path',
        'format': 'ext1',
    })
    project.set_subsets(['train'])

    # Only the three 'train' items should survive the subset filter.
    self.assertEqual(3, len(project.make_dataset()))
def test_project_can_merge_item_annotations(self):
    """Annotations of the same item coming from two sources are
    merged, with the shared annotation collapsed into one."""
    class TestExtractor1(Extractor):
        def __iter__(self):
            yield DatasetItem(id=1, subset='train', annotations=[
                Label(2, id=3),
                Label(3, attributes={'x': 1}),
            ])

    class TestExtractor2(Extractor):
        def __iter__(self):
            yield DatasetItem(id=1, subset='train', annotations=[
                Label(3, attributes={'x': 1}),
                Label(4, id=4),
            ])

    project = Project()
    for name, extractor in (('t1', TestExtractor1), ('t2', TestExtractor2)):
        project.env.extractors.register(name, extractor)
    project.add_source('source1', {'format': 't1'})
    project.add_source('source2', {'format': 't2'})

    merged = project.make_dataset()

    self.assertEqual(1, len(merged))
    merged_item = next(iter(merged))
    # 2 + 2 annotations with one duplicate -> 3 unique annotations.
    self.assertEqual(3, len(merged_item.annotations))
def test_project_compound_child_can_be_modified_recursively(self):
    """Items put into a parent dataset are routed to the child
    source named in their path."""
    with TestDir() as test_dir:
        parent = Project()
        for name in ('child1', 'child2'):
            child = Project({
                'project_dir': osp.join(test_dir, name),
            })
            child.save()
            parent.add_source(name, {'url': child.config.project_dir})

        dataset = parent.make_dataset()
        dataset.put(DatasetItem(id='ch1', path=['child1']))
        dataset.put(DatasetItem(id='ch2', path=['child2']))

        self.assertEqual(2, len(dataset))
        # Exactly one item landed in each child source.
        self.assertEqual(1, len(dataset.sources['child1']))
        self.assertEqual(1, len(dataset.sources['child2']))
def test_source_datasets_can_be_merged(self):
    """Two sources with disjoint item ids merge into one dataset
    containing every item from both."""
    class TestExtractor(Extractor):
        def __init__(self, url, n=0, s=0):
            super().__init__(length=n)
            self.n = n
            self.s = s

        def __iter__(self):
            # Ids start at the offset 's' so the sources do not overlap.
            for i in range(self.n):
                yield DatasetItem(id=self.s + i, subset='train')

    count1, count2 = 2, 4

    project = Project()
    project.env.extractors.register('e1',
        lambda p: TestExtractor(p, n=count1))
    project.env.extractors.register('e2',
        lambda p: TestExtractor(p, n=count2, s=count1))
    project.add_source('source1', {'format': 'e1'})
    project.add_source('source2', {'format': 'e2'})

    merged = project.make_dataset()

    self.assertEqual(count1 + count2, len(merged))
def test_custom_extractor_can_be_created(self):
    """A dataset built from a registered custom extractor matches the
    extractor's own contents."""
    class CustomExtractor(Extractor):
        def __iter__(self):
            all_items = [
                DatasetItem(id=0, subset='train'),
                DatasetItem(id=1, subset='train'),
                DatasetItem(id=2, subset='train'),
                DatasetItem(id=3, subset='test'),
                DatasetItem(id=4, subset='test'),
                DatasetItem(id=1),
                DatasetItem(id=2),
                DatasetItem(id=3),
            ]
            return iter(all_items)

    project = Project()
    project.env.extractors.register('ext1', CustomExtractor)
    project.add_source('src1', {
        'url': 'path',
        'format': 'ext1',
    })

    # The built dataset must be identical to the raw extractor output.
    compare_datasets(self, CustomExtractor(), project.make_dataset())
def test_project_own_dataset_can_be_modified(self):
    """An item put into a project's own dataset is retrievable."""
    project = Project()
    dataset = project.make_dataset()

    new_item = DatasetItem(id=1)
    dataset.put(new_item)

    self.assertEqual(new_item, next(iter(dataset)))
def test_can_have_project_source(self):
    """A generated project directory can serve as a source of
    another project."""
    with TestDir() as test_dir:
        Project.generate(test_dir)

        project2 = Project()
        project2.add_source('project1', {
            'url': test_dir,
        })
        dataset = project2.make_dataset()

        self.assertIn('project1', dataset.sources)
def test_can_save_and_load_own_dataset(self):
    """A project's own dataset round-trips through save and load."""
    with TestDir() as test_dir:
        src_dataset = Project().make_dataset()
        src_dataset.put(DatasetItem(id=1))
        src_dataset.save(test_dir)

        loaded_dataset = Project.load(test_dir).make_dataset()

        self.assertEqual(list(src_dataset), list(loaded_dataset))
def test_project_filter_can_be_applied(self):
    """An XPath filter applied on extraction keeps only matching items."""
    class TestExtractor(Extractor):
        def __iter__(self):
            return (DatasetItem(id=i, subset='train') for i in range(10))

    project = Project()
    project.env.extractors.register('type', TestExtractor)
    project.add_source('source', {'format': 'type'})

    filtered = project.make_dataset().extract('/item[id < 5]')

    # Ids 0..4 pass the 'id < 5' predicate.
    self.assertEqual(5, len(filtered))
def check_data(json_path):
    """Load a COCO-instances json into a Datumaro project and print
    basic dataset statistics."""
    project = Project()
    project.add_source('src1', {
        'url': str(json_path),
        'format': 'coco_instances',
    })
    dataset = project.make_dataset()

    for line in (
        f'{json_path.stem}',
        f'num images: {num_img(dataset)}',
        f'num images with annotations: {num_img_with_annots(dataset)}',
        f'num annotations: {num_annots(dataset)}',
    ):
        print(line)
def test_project_can_merge_item_annotations(self):
    """Annotations of the same item coming from two sources are
    merged, with the shared annotation collapsed into one."""
    class TestExtractor(Extractor):
        def __init__(self, url, v=None):
            super().__init__()
            self.v = v

        def __iter__(self):
            v1_item = DatasetItem(id=1, subset='train', annotations=[
                LabelObject(2, id=3),
                LabelObject(3, attributes={'x': 1}),
            ])
            v2_item = DatasetItem(id=1, subset='train', annotations=[
                LabelObject(3, attributes={'x': 1}),
                LabelObject(4, id=4),
            ])
            yield v1_item if self.v == 1 else v2_item

        def subsets(self):
            return ['train']

    project = Project()
    project.env.extractors.register('t1', lambda p: TestExtractor(p, v=1))
    project.env.extractors.register('t2', lambda p: TestExtractor(p, v=2))
    project.add_source('source1', {'format': 't1'})
    project.add_source('source2', {'format': 't2'})

    merged = project.make_dataset()

    self.assertEqual(1, len(merged))
    merged_item = next(iter(merged))
    # 2 + 2 annotations with one duplicate -> 3 unique annotations.
    self.assertEqual(3, len(merged_item.annotations))
def test_project_filter_can_be_applied(self):
    """A filter stored in the project configuration is applied when
    the dataset is built."""
    class TestExtractor(Extractor):
        def __init__(self, url, n=10):
            super().__init__(length=n)
            self.n = n

        def __iter__(self):
            return (DatasetItem(id=i, subset='train')
                for i in range(self.n))

        def subsets(self):
            return ['train']

    project = Project()
    project.env.extractors.register('type', TestExtractor)
    project.add_source('source', {'format': 'type'})
    project.set_filter('/item[id < 5]')

    filtered = project.make_dataset()

    # Ids 0..4 pass the 'id < 5' predicate.
    self.assertEqual(5, len(filtered))
def merge(cleaned_datasets, output, save_images=False):
    """Merge several Datumaro projects into a single saved dataset.

    Equivalent to: datum merge -o {output} {project_dirs}
    """
    print(f"Merging datasets to {output}/")

    datasets = [Project.load(path).make_dataset()
        for path in cleaned_datasets]

    # Perform the merge with fixed, permissive thresholds.
    merger = IntersectMerge(conf=IntersectMerge.Conf(
        pairwise_dist=0.25,
        groups=[],
        output_conf_thresh=0.0,
        quorum=0,
    ))
    merged_dataset = merger(datasets)

    # Re-host the merged items in a fresh project so that categories
    # are defined before the items are added.
    output_dataset = Project().make_dataset()
    output_dataset.define_categories(merged_dataset.categories())
    merged_dataset = output_dataset.update(merged_dataset)
    merged_dataset.save(save_dir=Path(output), save_images=save_images)
def merge_command(args):
    """CLI handler: merge the datasets of several projects into one.

    Loads every project listed in args.project, merges their datasets
    with IntersectMerge, then saves the merged dataset and a JSON merge
    report into the destination directory (args.dst_dir, or a generated
    'merged' directory when not given).

    Returns:
        0 on success.

    Raises:
        CliException: if the destination directory exists, is non-empty,
            and --overwrite was not passed.
    """
    source_projects = [load_project(p) for p in args.project]

    dst_dir = args.dst_dir
    if dst_dir:
        if not args.overwrite and osp.isdir(dst_dir) and os.listdir(dst_dir):
            raise CliException("Directory '%s' already exists "
                "(pass --overwrite to overwrite)" % dst_dir)
    else:
        dst_dir = generate_next_file_name('merged')

    source_datasets = []
    for p in source_projects:
        log.debug("Loading project '%s' dataset", p.config.project_name)
        source_datasets.append(p.make_dataset())

    merger = IntersectMerge(
        conf=IntersectMerge.Conf(pairwise_dist=args.iou_thresh,
            groups=args.groups,
            output_conf_thresh=args.output_conf_thresh,
            quorum=args.quorum))
    merged_dataset = merger(source_datasets)

    # Re-host the merged items in a fresh project so that categories
    # are defined before the items are added.
    merged_project = Project()
    output_dataset = merged_project.make_dataset()
    output_dataset.define_categories(merged_dataset.categories())
    merged_dataset = output_dataset.update(merged_dataset)
    merged_dataset.save(save_dir=dst_dir)

    report_path = osp.join(dst_dir, 'merge_report.json')
    save_merge_report(merger, report_path)

    dst_dir = osp.abspath(dst_dir)
    # Lazy %-args: the message is only formatted if INFO is enabled.
    log.info("Merge results have been saved to '%s'", dst_dir)
    log.info("Report has been saved to '%s'", report_path)

    return 0
def test_can_do_transform_with_custom_model(self):
    """End-to-end check of the model transform pipeline: run a custom
    launcher over a source dataset, save the predictions with a custom
    converter, and read them back with a custom extractor.
    """
    # Source: n items in 'train'; the "image" payload is just the index.
    class TestExtractorSrc(Extractor):
        def __init__(self, url, n=2):
            super().__init__(length=n)
            self.n = n

        def __iter__(self):
            for i in range(self.n):
                yield DatasetItem(id=i, subset='train', image=i,
                    annotations=[LabelObject(i)])

        def subsets(self):
            return ['train']

    # "Model": labels each input with its own value.
    class TestLauncher(Launcher):
        def __init__(self, **kwargs):
            pass

        def launch(self, inputs):
            for inp in inputs:
                yield [LabelObject(inp)]

    # Writes subset and label of each item into a '<id>.txt' file.
    class TestConverter(Converter):
        def __call__(self, extractor, save_dir):
            for item in extractor:
                with open(osp.join(save_dir, '%s.txt' % item.id), 'w+') as f:
                    f.write(str(item.subset) + '\n')
                    f.write(str(item.annotations[0].label) + '\n')

    # Reads the files produced by TestConverter back into items.
    class TestExtractorDst(Extractor):
        def __init__(self, url):
            super().__init__()
            self.items = [osp.join(url, p) for p in sorted(os.listdir(url))]

        def __iter__(self):
            for path in self.items:
                with open(path, 'r') as f:
                    index = osp.splitext(osp.basename(path))[0]
                    subset = f.readline()[:-1]
                    label = int(f.readline()[:-1])
                    assert (subset == 'train')
                    yield DatasetItem(id=index, subset=subset,
                        annotations=[LabelObject(label)])

        def __len__(self):
            return len(self.items)

        def subsets(self):
            return ['train']

    model_name = 'model'
    launcher_name = 'custom_launcher'
    extractor_name = 'custom_extractor'

    project = Project()
    project.env.launchers.register(launcher_name, TestLauncher)
    project.env.extractors.register(extractor_name, TestExtractorSrc)
    project.env.converters.register(extractor_name, TestConverter)
    project.add_model(model_name, {'launcher': launcher_name})
    project.add_source('source', {'format': extractor_name})

    with TestDir() as test_dir:
        # Run the model over the dataset and save results to disk.
        project.make_dataset().transform(model_name, test_dir.path)

        # Load the results back via the destination extractor.
        result = Project.load(test_dir.path)
        result.env.extractors.register(extractor_name, TestExtractorDst)
        it = iter(result.make_dataset())
        item1 = next(it)
        item2 = next(it)

        # Predicted labels should equal the source item indices.
        self.assertEqual(0, item1.annotations[0].label)
        self.assertEqual(1, item2.annotations[0].label)
# Mutually exclusive input selectors: explicit json paths, or a folder
# of jsons ('ap' and 'group' are defined earlier in this script).
group.add_argument('--json_paths', nargs="+",
    help='json paths separated by whitespace')
group.add_argument('--annots_folder',
    help='path of annotation folder containing multiple jsons')
ap.add_argument('--output_json', help='path of output json', required=True)
args = ap.parse_args()

# Create the Datumaro project that will aggregate all sources.
project = Project()

# Register each input json as a separate COCO-instances source.
if args.json_paths:
    for i, json_path in enumerate(args.json_paths):
        new_json_path = check_json_path(json_path)
        project.add_source(f'src{i}', {'url': str(new_json_path),
            'format': 'coco_instances'})
elif args.annots_folder:
    # NOTE: does not search subfolders recursively.
    for i, json_path in enumerate(Path(args.annots_folder).iterdir()):
        if json_path.suffix == '.json':
            new_json_path = check_json_path(json_path)
            project.add_source(f'src{i}', {'url': str(new_json_path),
                'format': 'coco_instances'})

# Materialize the combined dataset.
dataset = project.make_dataset()

# Print some summary stats.
print(f'num images: {num_img(dataset)}')
print(f'num images with annotations: {num_img_with_annots(dataset)}')
print(f'num annotations: {num_annots(dataset)}')

# Export the resulting json in COCO format.
export_json(dataset, args.output_json)
def mergeDataset(self, import_args: Arg, filter_arg: Arg):
    """Merge all datasets in self.datasetPathList into one dataset
    saved under self.projectsPath / self.mergeFolderName.

    Item ids that collide between datasets are first made unique by
    prefixing them with their dataset folder name (the image files are
    moved accordingly), then the datasets are merged with IntersectMerge,
    optionally filtered to annotated items only, re-numbered, and saved
    together with their images.
    """
    config = setConfig(import_args['format'])
    # Import every dataset path with the importer for the given format.
    source_datasets = dict([(path, Environment().make_importer(
        import_args['format'])(str(path)).make_dataset())
        for path in self.datasetPathList])
    # Flat list of (item id, dataset path) pairs over all datasets.
    itemIdsAndPath = reduce(lambda x, y: x + y, [[(item.id, path)
        for item in dataset]
        for path, dataset in source_datasets.items()])
    # for itemId, path in itemIdsAndPath:
    for path, dataset in source_datasets.items():
        itemIdsInPath = set(
            [itemId for itemId, _path in itemIdsAndPath if _path == path])
        itemIdsOutPath = set(
            [itemId for itemId, _path in itemIdsAndPath if _path != path])
        # Only rewrite ids when this dataset shares ids with another one.
        if itemIdsInPath & itemIdsOutPath:
            for subsetName, subset in dataset.subsets().items():
                imgDir: Path = path / config.getImgDir(subsetName)
                # Deep copy so subset.items can be mutated while iterating.
                _subset = deepcopy(subset.items)
                for item in _subset.values():
                    imgFile = Path(item.image.path)
                    relPath = imgFile.relative_to(imgDir)
                    newPath = imgDir / path.name / relPath
                    oldItemId = item.id
                    # New id: '<dataset name>/<relative path, no extension>'.
                    newItemId = item.id = str(path.name / relPath.parent /
                        relPath.stem).replace(
                        '\\', '/')
                    item.image._path = str(newPath)
                    # Re-key the item under its new id.
                    del subset.items[oldItemId]
                    subset.items[newItemId] = item
                    newPath.parent.mkdir(parents=True, exist_ok=True)
                    if item.image.has_data:
                        # Move the image under the dataset-name subfolder.
                        move(str(imgFile), str(imgDir / path.name / relPath))
    # Recreate the merge output folder from scratch.
    mergePath = (self.projectsPath / self.mergeFolderName)
    if mergePath.is_dir():
        rmtree(mergePath, onerror=remove_readonly)
    mergePath.mkdir(exist_ok=True, parents=True)
    dst_dir = str(mergePath)
    merger = IntersectMerge(conf=IntersectMerge.Conf())
    merged_dataset = merger(list(source_datasets.values()))
    # Re-host the merged items in a fresh project with categories defined.
    merged_project = Project()
    output_dataset = merged_project.make_dataset()
    output_dataset.define_categories(merged_dataset.categories())
    merged_dataset = output_dataset.update(merged_dataset)
    # NOTE(review): 'y' appears to mean "drop items without annotations" --
    # confirm the intended semantics of the 'no_anno_filter' flag.
    if filter_arg['no_anno_filter'].lower() == 'y':
        filtered_dataset = Project().make_dataset()
        filtered_dataset.define_categories(merged_dataset.categories())
        merged_dataset = filtered_dataset.update(
            merged_dataset.select(lambda item:
                len(item.annotations) != 0))
    # Re-number image-id attributes and annotation ids sequentially.
    annoId = 1
    imageIdName = config.imageIdName
    for idx, item in tqdm(enumerate(merged_dataset), desc='datasets'):
        if imageIdName is not None:
            item.attributes[imageIdName] = idx + 1
        for anno in item.annotations:
            anno.id = annoId
            annoId += 1
    merged_dataset.save(save_dir=dst_dir, save_images=True)
    # for subsetName, subset in tqdm(merged_dataset.subsets().items(), desc='datasets'):
    #     for idx, itemId in tqdm(enumerate(itemIds), desc='items'):
    #         if imageIdName is not None:
    #             merged_dataset.get(itemId,subset=subsetName).attributes[imageIdName] = idx+1
    #         for anno in merged_dataset.get(itemId, subset=subsetName).annotations:
    #             anno.id = annoId
    #             annoId += 1
    # merged_dataset.save(save_dir=dst_dir, save_images=True)
    return self