def import_command(args):
    """Create a project from an existing dataset.

    The following attributes of ``args`` are read: ``dst_dir``,
    ``overwrite``, ``name``, ``source``, ``format``, ``extra_args``,
    ``skip_check`` and ``copy``. When the format is auto-detected,
    ``args.format`` is updated with the detected format name.

    Returns:
        0 on success, 1 if no importer recognized the source,
        2 if more than one importer recognized it.

    Raises:
        CliException: if the project environment directory or the project
            dataset directory is not empty and ``args.overwrite`` is not
            set, if extra arguments are passed without a format, or if
            there is no importer for the requested format.
    """
    project_dir = osp.abspath(args.dst_dir)

    project_env_dir = osp.join(project_dir, DEFAULT_CONFIG.env_dir)
    if osp.isdir(project_env_dir) and os.listdir(project_env_dir):
        if not args.overwrite:
            raise CliException("Directory '%s' already exists "
                "(pass --overwrite to overwrite)" % project_env_dir)
        # Removal errors are ignored here on purpose (best effort).
        shutil.rmtree(project_env_dir, ignore_errors=True)

    own_dataset_dir = osp.join(project_dir, DEFAULT_CONFIG.dataset_dir)
    if osp.isdir(own_dataset_dir) and os.listdir(own_dataset_dir):
        if not args.overwrite:
            raise CliException("Directory '%s' already exists "
                "(pass --overwrite to overwrite)" % own_dataset_dir)
        # NOTE: remove the dir to avoid using data from previous project
        shutil.rmtree(own_dataset_dir)

    project_name = args.name
    if project_name is None:
        project_name = osp.basename(project_dir)

    env = Environment()
    log.info("Importing project from '%s'", args.source)

    extra_args = {}
    if not args.format:
        if args.extra_args:
            raise CliException("Extra args can not be used without format")

        log.info("Trying to detect dataset format...")

        matches = []
        for format_name in env.importers.items:
            log.debug("Checking '%s' format...", format_name)
            importer = env.make_importer(format_name)
            # Only the detection call itself may signal "not supported".
            try:
                match = importer.detect(args.source)
            except NotImplementedError:
                log.debug("Format '%s' does not support auto detection.",
                    format_name)
                continue
            if match:
                log.debug("format matched")
                matches.append((format_name, importer))

        if not matches:
            log.error("Failed to detect dataset format automatically. "
                "Try to specify format with '-f/--format' parameter.")
            return 1
        if len(matches) != 1:
            log.error(
                "Multiple formats match the dataset: %s. "
                "Try to specify format with '-f/--format' parameter.",
                ', '.join(m[0] for m in matches))
            return 2

        format_name, importer = matches[0]
        args.format = format_name
    else:
        # Keep the try body minimal: a KeyError raised while parsing the
        # importer options must not be reported as a missing importer.
        try:
            importer = env.make_importer(args.format)
        except KeyError as err:
            raise CliException("Importer for format '%s' is not found" %
                args.format) from err
        if hasattr(importer, 'from_cmdline'):
            extra_args = importer.from_cmdline(args.extra_args)

    log.info("Importing project as '%s'", args.format)

    source = osp.abspath(args.source)
    project = importer(source, **extra_args)
    project.config.project_name = project_name
    project.config.project_dir = project_dir

    # A dataset is built for the check and is also needed for copying.
    if not args.skip_check or args.copy:
        log.info("Checking the dataset...")
        dataset = project.make_dataset()
    if args.copy:
        log.info("Cloning data...")
        dataset.save(merge=True, save_images=True)
    else:
        project.save()

    log.info("Project has been created at '%s'", project_dir)

    return 0
def build_export_parser(parser_ctor=argparse.ArgumentParser):
    """Build the argument parser of the project export command.

    The names of the converters registered in a fresh ``Environment`` are
    sorted and listed in the parser description. The created parser gets
    ``export_command`` as its ``command`` default.

    NOTE(review): ``help=`` is not a constructor argument of
    ``argparse.ArgumentParser``, so the default ``parser_ctor`` looks
    unusable as-is; callers presumably pass a sub-parser factory such as
    ``subparsers.add_parser`` - confirm.

    Args:
        parser_ctor: callable used to create the parser; it receives
            ``help``, ``description`` and ``formatter_class`` keywords.

    Returns:
        The configured parser.
    """
    format_names = ', '.join(sorted(Environment().converters.items))
    # '|n' and '|s' are presumably markup understood by MultilineFormatter
    # (verify against its implementation).
    description = """
            Exports the project dataset in some format. Optionally, a filter
            can be passed, check 'filter' command description for more info.
            Each dataset format has its own options, which
            are passed after '--' separator (see examples), pass '-- -h'
            for more info. If not stated otherwise, by default
            only annotations are exported, to include images pass
            '--save-images' parameter.|n
            |n
            Formats:|n
            In Datumaro dataset formats are supported by Converter-s.
            A Converter produces a dataset of a specific format
            from dataset items. It is possible to add a custom Converter.
            To do this, you need to put a Converter
            definition script to <project_dir>/.datumaro/converters.|n
            |n
            List of builtin dataset formats: %s|n
            |n
            Examples:|n
            - Export project as a VOC-like dataset, include images:|n
            |s|sexport -f voc -- --save-images|n
            |n
            - Export project as a COCO-like dataset in other directory:|n
            |s|sexport -f coco -o path/I/like/
        """ % format_names

    export_parser = parser_ctor(
        help="Export project",
        description=description,
        formatter_class=MultilineFormatter,
    )

    export_parser.add_argument(
        '-e', '--filter',
        default=None,
        help="Filter expression for dataset items",
    )

    filter_mode_options = ', '.join(FilterModes.list_options())
    export_parser.add_argument(
        '--filter-mode',
        default=FilterModes.i.name,
        type=FilterModes.parse,
        help="Filter mode (options: %s; default: %s)" % \
            (filter_mode_options, '%(default)s'),
    )

    export_parser.add_argument(
        '-o', '--output-dir',
        dest='dst_dir',
        default=None,
        help="Directory to save output (default: a subdir in the current one)",
    )
    export_parser.add_argument(
        '--overwrite',
        action='store_true',
        help="Overwrite existing files in the save directory",
    )
    export_parser.add_argument(
        '-p', '--project',
        dest='project_dir',
        default='.',
        help="Directory of the project to operate on (default: current dir)",
    )
    export_parser.add_argument(
        '-f', '--format',
        required=True,
        help="Output format",
    )
    # Everything left on the command line is collected for the converter.
    export_parser.add_argument(
        'extra_args',
        nargs=argparse.REMAINDER,
        default=None,
        help="Additional arguments for converter (pass '-- -h' for help)",
    )

    export_parser.set_defaults(command=export_command)
    return export_parser
def import_command(args):
    """Create a project from an existing dataset.

    The following attributes of ``args`` are read: ``dst_dir``,
    ``overwrite``, ``name``, ``source``, ``format``, ``extra_args``,
    ``skip_check`` and ``copy``. If no format is given, it is detected
    with ``Environment.detect_dataset``.

    Returns:
        0 on success, 1 if the format could not be detected or the
        detection is ambiguous.

    Raises:
        CliException: if the project environment directory or the project
            dataset directory is not empty and ``args.overwrite`` is not
            set, if extra arguments are passed without a format, if extra
            arguments are passed for a format that is neither a known
            importer nor a known extractor, or if the format does not
            accept extra parameters.
    """
    project_dir = osp.abspath(args.dst_dir)

    project_env_dir = osp.join(project_dir, DEFAULT_CONFIG.env_dir)
    if osp.isdir(project_env_dir) and os.listdir(project_env_dir):
        if not args.overwrite:
            raise CliException("Directory '%s' already exists "
                "(pass --overwrite to overwrite)" % project_env_dir)
        # Removal errors are ignored here on purpose (best effort).
        shutil.rmtree(project_env_dir, ignore_errors=True)

    own_dataset_dir = osp.join(project_dir, DEFAULT_CONFIG.dataset_dir)
    if osp.isdir(own_dataset_dir) and os.listdir(own_dataset_dir):
        if not args.overwrite:
            raise CliException("Directory '%s' already exists "
                "(pass --overwrite to overwrite)" % own_dataset_dir)
        # NOTE: remove the dir to avoid using data from previous project
        shutil.rmtree(own_dataset_dir)

    project_name = args.name
    if project_name is None:
        project_name = osp.basename(project_dir)

    env = Environment()
    log.info("Importing project from '%s'", args.source)

    extra_args = {}
    fmt = args.format
    if not fmt:
        if args.extra_args:
            raise CliException("Extra args can not be used without format")

        log.info("Trying to detect dataset format...")

        matches = env.detect_dataset(args.source)
        if not matches:
            log.error("Failed to detect dataset format. "
                "Try to specify format with '-f/--format' parameter.")
            return 1
        if len(matches) != 1:
            log.error(
                "Multiple formats match the dataset: %s. "
                "Try to specify format with '-f/--format' parameter.",
                ', '.join(matches))
            return 1

        fmt = matches[0]
    elif args.extra_args:
        # Extra parameters are parsed by the importer if there is one,
        # otherwise by the extractor of the same format.
        if fmt in env.importers:
            arg_parser = env.importers[fmt]
        elif fmt in env.extractors:
            arg_parser = env.extractors[fmt]
        else:
            # The trailing space keeps the two adjacent literals from
            # running together ("addedby") in the message.
            raise CliException(
                "Unknown format '%s'. A format can be added "
                "by providing an Extractor and Importer plugins" % fmt)

        if hasattr(arg_parser, 'parse_cmdline'):
            extra_args = arg_parser.parse_cmdline(args.extra_args)
        else:
            raise CliException("Format '%s' does not accept "
                "extra parameters" % fmt)

    log.info("Importing project as '%s'", fmt)

    project = Project.import_from(osp.abspath(args.source), fmt, **extra_args)
    project.config.project_name = project_name
    project.config.project_dir = project_dir

    # A dataset is built for the check and is also needed for copying.
    if not args.skip_check or args.copy:
        log.info("Checking the dataset...")
        dataset = project.make_dataset()
    if args.copy:
        log.info("Cloning data...")
        dataset.save(merge=True, save_images=True)
    else:
        project.save()

    log.info("Project has been created at '%s'", project_dir)

    return 0
def build_import_parser(parser_ctor=argparse.ArgumentParser):
    """Build the argument parser of the project import command.

    The names of the importers registered in a fresh ``Environment`` are
    sorted and listed in the parser description. The created parser gets
    ``import_command`` as its ``command`` default.

    NOTE(review): ``help=`` is not a constructor argument of
    ``argparse.ArgumentParser``, so the default ``parser_ctor`` looks
    unusable as-is; callers presumably pass a sub-parser factory such as
    ``subparsers.add_parser`` - confirm.

    Args:
        parser_ctor: callable used to create the parser; it receives
            ``help``, ``description`` and ``formatter_class`` keywords.

    Returns:
        The configured parser.
    """
    format_names = ', '.join(sorted(Environment().importers.items))
    # '|n' and '|s' are presumably markup understood by MultilineFormatter
    # (verify against its implementation).
    description = """
            Creates a project from an existing dataset. The source can be:|n
            - a dataset in a supported format (check 'formats' section below)|n
            - a Datumaro project|n
            |n
            Formats:|n
            Datasets come in a wide variety of formats. Each dataset
            format defines its own data structure and rules on how to
            interpret the data. For example, the following data structure
            is used in COCO format:|n
            /dataset/|n
            - /images/<id>.jpg|n
            - /annotations/|n
            |n
            In Datumaro dataset formats are supported by
            Extractor-s and Importer-s.
            An Extractor produces a list of dataset items corresponding
            to the dataset. An Importer creates a project from the
            data source location.
            It is possible to add a custom Extractor and Importer.
            To do this, you need to put an Extractor and
            Importer implementation scripts to
            <project_dir>/.datumaro/extractors
            and <project_dir>/.datumaro/importers.|n
            |n
            List of builtin dataset formats: %s|n
            |n
            Examples:|n
            - Create a project from VOC dataset in the current directory:|n
            |s|simport -f voc -i path/to/voc|n
            |n
            - Create a project from COCO dataset in other directory:|n
            |s|simport -f coco -i path/to/coco -o path/I/like/
        """ % format_names

    import_parser = parser_ctor(
        help="Create project from an existing dataset",
        description=description,
        formatter_class=MultilineFormatter,
    )

    import_parser.add_argument(
        '-o', '--output-dir',
        dest='dst_dir',
        default='.',
        help="Directory to save the new project to (default: current dir)",
    )
    import_parser.add_argument(
        '-n', '--name',
        default=None,
        help="Name of the new project (default: same as project dir)",
    )
    import_parser.add_argument(
        '--copy',
        action='store_true',
        help="Copy the dataset instead of saving source links",
    )
    import_parser.add_argument(
        '--skip-check',
        action='store_true',
        help="Skip source checking",
    )
    import_parser.add_argument(
        '--overwrite',
        action='store_true',
        help="Overwrite existing files in the save directory",
    )
    import_parser.add_argument(
        '-i', '--input-path',
        dest='source',
        required=True,
        help="Path to import project from",
    )
    import_parser.add_argument(
        '-f', '--format',
        help="Source project format. Will try to detect, if not specified.",
    )
    # Everything left on the command line is collected for the importer.
    import_parser.add_argument(
        'extra_args',
        nargs=argparse.REMAINDER,
        help="Additional arguments for importer (pass '-- -h' for help)",
    )

    import_parser.set_defaults(command=import_command)
    return import_parser
# Copyright (C) 2020 Intel Corporation # # SPDX-License-Identifier: MIT from datumaro.components.project import Environment from cvat.apps.engine.models import DimensionType dm_env = Environment() class _Format: NAME = '' EXT = '' VERSION = '' DISPLAY_NAME = '{NAME} {VERSION}' ENABLED = True class Exporter(_Format): def __call__(self, dst_file, task_data, **options): raise NotImplementedError() class Importer(_Format): def __call__(self, src_file, task_data, **options): raise NotImplementedError() def _wrap_format(f_or_cls, klass, name, version, ext, display_name, enabled, dimension=DimensionType.DIM_2D): import inspect assert inspect.isclass(f_or_cls) or inspect.isfunction(f_or_cls) if inspect.isclass(f_or_cls): assert hasattr(f_or_cls, '__call__')
def convert_command(args):
    """Convert a dataset from one format to another.

    The following attributes of ``args`` are read: ``output_format``,
    ``extra_args``, ``filter_mode``, ``input_format``, ``source``,
    ``dst_dir``, ``overwrite`` and ``filter``. When the input format is
    auto-detected, ``args.input_format`` is updated with the detected
    format name.

    Returns:
        0 on success, 1 if no importer recognized the source,
        2 if more than one importer recognized it.

    Raises:
        CliException: if there is no converter for the output format, no
            importer for the input format, or the output directory is not
            empty and ``args.overwrite`` is not set.
    """
    env = Environment()

    try:
        converter = env.converters.get(args.output_format)
    except KeyError as err:
        raise CliException("Converter for format '%s' is not found" %
            args.output_format) from err
    converter_args = converter.from_cmdline(args.extra_args)

    def converter_proxy(extractor, save_dir):
        return converter.convert(extractor, save_dir, **converter_args)

    filter_args = FilterModes.make_filter_args(args.filter_mode)

    # Importer options live in their own variable. The closure above looks
    # its options up when it is called, so reusing one name for both would
    # replace the converter options with the importer ones.
    importer_args = {}
    if not args.input_format:
        matches = []
        for format_name in env.importers.items:
            log.debug("Checking '%s' format...", format_name)
            importer = env.make_importer(format_name)
            # Only the detection call itself may signal "not supported".
            try:
                match = importer.detect(args.source)
            except NotImplementedError:
                log.debug("Format '%s' does not support auto detection.",
                    format_name)
                continue
            if match:
                log.debug("format matched")
                matches.append((format_name, importer))

        if not matches:
            log.error(
                "Failed to detect dataset format. "
                "Try to specify format with '-if/--input-format' parameter.")
            return 1
        if len(matches) != 1:
            log.error(
                "Multiple formats match the dataset: %s. "
                "Try to specify format with '-if/--input-format' parameter.",
                ', '.join(m[0] for m in matches))
            return 2

        format_name, importer = matches[0]
        args.input_format = format_name
        log.info("Source dataset format detected as '%s'", args.input_format)
    else:
        # Keep the try body minimal: a KeyError raised while building the
        # importer options must not be reported as a missing importer.
        try:
            importer = env.make_importer(args.input_format)
        except KeyError as err:
            raise CliException("Importer for format '%s' is not found" %
                args.input_format) from err
        if hasattr(importer, 'from_cmdline'):
            importer_args = importer.from_cmdline()

    source = osp.abspath(args.source)

    dst_dir = args.dst_dir
    if dst_dir:
        if not args.overwrite and osp.isdir(dst_dir) and os.listdir(dst_dir):
            raise CliException("Directory '%s' already exists "
                "(pass --overwrite to overwrite)" % dst_dir)
    else:
        dst_dir = generate_next_file_name('%s-%s' % \
            (osp.basename(source), make_file_name(args.output_format)))
    dst_dir = osp.abspath(dst_dir)

    project = importer(source, **importer_args)
    dataset = project.make_dataset()

    log.info("Exporting the dataset")
    dataset.export_project(
        save_dir=dst_dir,
        converter=converter_proxy,
        filter_expr=args.filter,
        **filter_args)

    log.info("Dataset exported to '%s' as '%s'", dst_dir, args.output_format)

    return 0
def mergeDataset(self, import_args: Arg, filter_arg: Arg):
    """Merge the datasets from ``self.datasetPathList`` into one dataset.

    Every path is imported with the format ``import_args['format']``.
    If a dataset contains an item id that also occurs in another source
    dataset, all of its items are re-keyed with the dataset directory name
    as a prefix and their image files are moved into a sub-directory of
    the same name. The merged dataset is saved, with images, to
    ``self.projectsPath / self.mergeFolderName``; an existing directory at
    that location is removed first.

    Args:
        import_args: mapping; the ``'format'`` entry is read.
        filter_arg: mapping; if the ``'no_anno_filter'`` entry equals
            ``'y'`` (case-insensitive), items without annotations are
            dropped from the result.

    Returns:
        ``self``.
    """
    config = setConfig(import_args['format'])

    # One environment serves all paths; an importer is still made per path.
    env = Environment()
    source_datasets = {
        path: env.make_importer(import_args['format'])(str(path)).make_dataset()
        for path in self.datasetPathList
    }

    # Count in how many source datasets each item id occurs. This also
    # works when there are no source paths at all.
    idsByPath = {path: {item.id for item in dataset}
        for path, dataset in source_datasets.items()}
    ownerCount = {}
    for itemIds in idsByPath.values():
        for itemId in itemIds:
            ownerCount[itemId] = ownerCount.get(itemId, 0) + 1

    for path, dataset in source_datasets.items():
        # Only datasets sharing an id with another dataset are re-keyed.
        if not any(ownerCount[itemId] > 1 for itemId in idsByPath[path]):
            continue

        for subsetName, subset in dataset.subsets().items():
            imgDir: Path = path / config.getImgDir(subsetName)
            # Iterate over a copy: subset.items is re-keyed in the loop.
            for item in deepcopy(subset.items).values():
                imgFile = Path(item.image.path)
                relPath = imgFile.relative_to(imgDir)
                newPath = imgDir / path.name / relPath

                oldItemId = item.id
                # Item ids always use forward slashes.
                newItemId = str(
                    path.name / relPath.parent / relPath.stem
                ).replace('\\', '/')
                item.id = newItemId
                item.image._path = str(newPath)

                del subset.items[oldItemId]
                subset.items[newItemId] = item

                newPath.parent.mkdir(parents=True, exist_ok=True)
                if item.image.has_data:
                    move(str(imgFile), str(newPath))

    mergePath = self.projectsPath / self.mergeFolderName
    if mergePath.is_dir():
        rmtree(mergePath, onerror=remove_readonly)
    mergePath.mkdir(exist_ok=True, parents=True)
    dst_dir = str(mergePath)

    merger = IntersectMerge(conf=IntersectMerge.Conf())
    merged_dataset = merger(list(source_datasets.values()))

    merged_project = Project()
    output_dataset = merged_project.make_dataset()
    output_dataset.define_categories(merged_dataset.categories())
    merged_dataset = output_dataset.update(merged_dataset)

    if filter_arg['no_anno_filter'].lower() == 'y':
        filtered_dataset = Project().make_dataset()
        filtered_dataset.define_categories(merged_dataset.categories())
        merged_dataset = filtered_dataset.update(
            merged_dataset.select(lambda item: len(item.annotations) != 0))

    # Number images (if the format config names an image id attribute)
    # and annotations consecutively, starting from 1.
    annoId = 1
    imageIdName = config.imageIdName
    for imageId, item in tqdm(enumerate(merged_dataset, start=1),
            desc='datasets'):
        if imageIdName is not None:
            item.attributes[imageIdName] = imageId
        for anno in item.annotations:
            anno.id = annoId
            annoId += 1

    merged_dataset.save(save_dir=dst_dir, save_images=True)

    return self