def download_command(args):
    env = Environment()

    if args.dataset_id.startswith('tfds:'):
        if TFDS_EXTRACTOR_AVAILABLE:
            tfds_ds_name = args.dataset_id[5:]  # strip the 'tfds:' prefix

            tfds_ds_metadata = AVAILABLE_TFDS_DATASETS.get(tfds_ds_name)

            if tfds_ds_metadata:
                default_converter_name = tfds_ds_metadata.default_converter_name
                extractor_factory = lambda: make_tfds_extractor(tfds_ds_name)
            else:
                raise CliException(f"Unsupported TFDS dataset '{tfds_ds_name}'")
        else:
            raise CliException(
                "TFDS datasets are not available, because TFDS and/or "
                "TensorFlow are not installed.\n"
                "You can install them with: pip install datumaro[tf,tfds]")
    else:
        raise CliException(f"Unknown dataset ID '{args.dataset_id}'")

    output_format = args.output_format or default_converter_name

    try:
        converter = env.converters[output_format]
    except KeyError:
        raise CliException("Converter for format '%s' is not found"
            % output_format)
    extra_args = converter.parse_cmdline(args.extra_args)

    dst_dir = args.dst_dir
    if dst_dir:
        if not args.overwrite and osp.isdir(dst_dir) and os.listdir(dst_dir):
            raise CliException("Directory '%s' already exists "
                "(pass --overwrite to overwrite)" % dst_dir)
    else:
        dst_dir = generate_next_file_name('%s-%s' % (
            make_file_name(args.dataset_id),
            make_file_name(output_format),
        ))
    dst_dir = osp.abspath(dst_dir)

    log.info("Downloading the dataset")
    extractor = extractor_factory()

    log.info("Exporting the dataset")
    converter.convert(extractor, dst_dir,
        default_image_ext='.png', **extra_args)

    log.info("Dataset exported to '%s' as '%s'" % (dst_dir, output_format))

def convert_command(args):
    env = Environment()

    try:
        converter = env.converters[args.output_format]
    except KeyError:
        raise CliException("Converter for format '%s' is not found"
            % args.output_format)
    extra_args = converter.parse_cmdline(args.extra_args)

    filter_args = FilterModes.make_filter_args(args.filter_mode)

    fmt = args.input_format
    if not args.input_format:
        matches = env.detect_dataset(args.source)
        if len(matches) == 0:
            log.error(
                "Failed to detect dataset format. "
                "Try to specify format with '-if/--input-format' parameter.")
            return 1
        elif len(matches) != 1:
            log.error(
                "Multiple formats match the dataset: %s. "
                "Try to specify format with '-if/--input-format' parameter.",
                ', '.join(matches))
            return 2

        fmt = matches[0]
        # report the detected format; args.input_format is empty in this branch
        log.info("Source dataset format detected as '%s'", fmt)

    source = osp.abspath(args.source)

    dst_dir = args.dst_dir
    if dst_dir:
        if not args.overwrite and osp.isdir(dst_dir) and os.listdir(dst_dir):
            raise CliException("Directory '%s' already exists "
                "(pass --overwrite to overwrite)" % dst_dir)
    else:
        dst_dir = generate_next_file_name('%s-%s' % (
            osp.basename(source), make_file_name(args.output_format)))
    dst_dir = osp.abspath(dst_dir)

    dataset = Dataset.import_from(source, fmt)

    log.info("Exporting the dataset")
    if args.filter:
        dataset = dataset.filter(args.filter, **filter_args)
    dataset.export(format=args.output_format, save_dir=dst_dir, **extra_args)

    log.info("Dataset exported to '%s' as '%s'"
        % (dst_dir, args.output_format))

    return 0

def normalize_label(label):
    label = make_file_name(label)  # basically, convert to ASCII lowercase
    label = label.replace('-', '_')
    return label

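# Illustrative behavior of normalize_label() (not from the original source;
# assumes make_file_name() slugifies its input: lowercase ASCII, with spaces
# and punctuation collapsed to '-'):
#
#   >>> normalize_label('Traffic Light')   # 'Traffic Light' -> 'traffic-light'
#   'traffic_light'
#   >>> normalize_label('Café')            # non-ASCII is reduced to ASCII
#   'cafe'
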
def _save_item_annotations(
    self,
    item,
    label_description_writer,
    bbox_description_writer,
    mask_description_writer,
    label_categories,
    image_meta,
):
    next_box_id = 0

    existing_box_ids = {
        annotation.attributes['box_id']
        for annotation in item.annotations
        if annotation.type is AnnotationType.mask
        if 'box_id' in annotation.attributes
    }

    for instance in find_instances(item.annotations):
        instance_box = next(
            (a for a in instance if a.type is AnnotationType.bbox), None)

        for annotation in instance:
            if annotation.type is AnnotationType.label:
                label_description_writer.writerow({
                    'ImageID': item.id,
                    'LabelName': label_categories[annotation.label].name,
                    'Confidence': str(annotation.attributes.get('score', 1)),
                })
            elif annotation.type is AnnotationType.bbox:
                if item.has_image and item.image.size is not None:
                    image_meta[item.id] = item.image.size
                    height, width = item.image.size
                else:
                    log.warning(
                        "Can't encode box for item '%s' due to missing image file",
                        item.id)
                    continue

                bbox_description_writer.writerow({
                    'ImageID': item.id,
                    'LabelName': label_categories[annotation.label].name,
                    'Confidence': str(annotation.attributes.get('score', 1)),
                    'XMin': annotation.x / width,
                    'YMin': annotation.y / height,
                    'XMax': (annotation.x + annotation.w) / width,
                    'YMax': (annotation.y + annotation.h) / height,
                    **{
                        bool_attr.oid_name: int(annotation.attributes.get(
                            bool_attr.datumaro_name, -1))
                        for bool_attr in OpenImagesPath.BBOX_BOOLEAN_ATTRIBUTES
                    },
                })
            elif annotation.type is AnnotationType.mask:
                mask_dir = osp.join(self._save_dir,
                    OpenImagesPath.MASKS_DIR, item.subset)

                box_id_str = annotation.attributes.get('box_id')

                if box_id_str:
                    if _RE_INVALID_PATH_COMPONENT.fullmatch(box_id_str):
                        raise UnsupportedBoxIdError(
                            item_id=item.id, box_id=box_id_str)
                else:
                    # find a box ID that isn't used in any other annotations
                    while True:
                        box_id_str = format(next_box_id, "08x")
                        next_box_id += 1
                        if box_id_str not in existing_box_ids:
                            break

                label_name = label_categories[annotation.label].name
                mask_file_name = '%s_%s_%s.png' % (
                    make_file_name(item.id),
                    make_file_name(label_name),
                    box_id_str,
                )

                box_coords = {}

                if instance_box is not None:
                    if item.has_image and item.image.size is not None:
                        image_meta[item.id] = item.image.size
                        height, width = item.image.size

                        box_coords = {
                            'BoxXMin': instance_box.x / width,
                            'BoxXMax': (instance_box.x + instance_box.w)
                                / width,
                            'BoxYMin': instance_box.y / height,
                            'BoxYMax': (instance_box.y + instance_box.h)
                                / height,
                        }
                    else:
                        log.warning(
                            "Can't encode box coordinates for a mask"
                            " for item '%s' due to missing image file",
                            item.id)

                mask_description_writer.writerow({
                    'MaskPath': mask_file_name,
                    'ImageID': item.id,
                    'LabelName': label_name,
                    'BoxID': box_id_str,
                    **box_coords,
                    'PredictedIoU':
                        annotation.attributes.get('predicted_iou', ''),
                })

                save_image(osp.join(mask_dir, mask_file_name),
                    annotation.image, create_dir=True)

def export_command(args):
    has_sep = '--' in args._positionals
    if has_sep:
        pos = args._positionals.index('--')
        if 1 < pos:
            raise argparse.ArgumentError(None,
                message="Expected no more than 1 target argument")
    else:
        pos = 1
    args.target = (args._positionals[:pos]
        or [ProjectBuildTargets.MAIN_TARGET])[0]
    # skip the '--' separator, if present (has_sep is 0 or 1)
    args.extra_args = args._positionals[pos + has_sep:]
    # Resulting split (illustrative):
    #   ['src', '--', '--opt'] -> target='src',        extra_args=['--opt']
    #   ['--', '--opt']        -> target=MAIN_TARGET,  extra_args=['--opt']

    show_plugin_help = '-h' in args.extra_args or '--help' in args.extra_args

    project = None
    try:
        project = scope_add(load_project(args.project_dir))
    except ProjectNotFoundError:
        if not show_plugin_help:
            raise

    if project is not None:
        env = project.env
    else:
        env = Environment()

    try:
        converter = env.converters[args.format]
    except KeyError:
        raise CliException("Converter for format '%s' is not found"
            % args.format)
    extra_args = converter.parse_cmdline(args.extra_args)

    dst_dir = args.dst_dir
    if dst_dir:
        if not args.overwrite and osp.isdir(dst_dir) and os.listdir(dst_dir):
            raise CliException("Directory '%s' already exists "
                "(pass --overwrite to overwrite)" % dst_dir)
    else:
        dst_dir = generate_next_file_name('export-%s'
            % make_file_name(args.format))
    dst_dir = osp.abspath(dst_dir)

    if args.filter:
        filter_args = FilterModes.make_filter_args(args.filter_mode)
        filter_expr = args.filter

    log.info("Loading the project...")

    dataset = project.working_tree.make_dataset(args.target)
    if args.filter:
        dataset.filter(filter_expr, **filter_args)

    log.info("Exporting...")

    dataset.export(save_dir=dst_dir, format=converter, **extra_args)

    log.info("Results have been saved to '%s'" % dst_dir)

    return 0

def export(dst_format, task_id=None, project_id=None, server_url=None,
        save_images=False):
    # fall back to the global logger, so the except clause below can't hit
    # an unbound name if the instance lookup itself fails
    logger = slogger.glob
    try:
        if task_id is not None:
            db_instance = Task.objects.get(pk=task_id)
            logger = slogger.task[task_id]
            cache_ttl = TASK_CACHE_TTL
            export_fn = task.export_task
        else:
            db_instance = Project.objects.get(pk=project_id)
            logger = slogger.project[project_id]
            cache_ttl = PROJECT_CACHE_TTL
            export_fn = project.export_project

        cache_dir = get_export_cache_dir(db_instance)

        exporter = EXPORT_FORMATS[dst_format]
        output_base = '%s_%s' % ('dataset' if save_images else 'annotations',
            make_file_name(to_snake_case(dst_format)))
        output_path = '%s.%s' % (output_base, exporter.EXT)
        output_path = osp.join(cache_dir, output_path)

        instance_time = timezone.localtime(
            db_instance.updated_date).timestamp()
        if isinstance(db_instance, Project):
            # a project is considered updated whenever any of its tasks is
            tasks_update = list(map(
                lambda db_task: timezone.localtime(
                    db_task.updated_date).timestamp(),
                db_instance.tasks.all()))
            instance_time = max(tasks_update + [instance_time])

        # regenerate the archive only if the cached copy is older than the
        # last update of the instance
        if not (osp.exists(output_path)
                and instance_time <= osp.getmtime(output_path)):
            os.makedirs(cache_dir, exist_ok=True)
            with tempfile.TemporaryDirectory(dir=cache_dir) as temp_dir:
                temp_file = osp.join(temp_dir, 'result')
                export_fn(db_instance.id, temp_file, dst_format,
                    server_url=server_url, save_images=save_images)
                os.replace(temp_file, output_path)

            archive_ctime = osp.getctime(output_path)
            scheduler = django_rq.get_scheduler()
            cleaning_job = scheduler.enqueue_in(time_delta=cache_ttl,
                func=clear_export_cache,
                task_id=task_id,
                file_path=output_path,
                file_ctime=archive_ctime)
            logger.info(
                "The {} '{}' is exported as '{}' at '{}' "
                "and available for downloading for the next {}. "
                "Export cache cleaning job is enqueued, id '{}'".format(
                    "project" if isinstance(db_instance, Project) else 'task',
                    db_instance.name, dst_format, output_path, cache_ttl,
                    cleaning_job.id))

        return output_path
    except Exception:
        log_exception(logger)
        raise
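
# Typical use (illustrative; the accepted names come from the EXPORT_FORMATS
# registry, e.g. 'CVAT for images 1.1' in a stock installation):
#
#   output_path = export('CVAT for images 1.1', task_id=42, save_images=True)
#
# The returned path points into the instance's export cache; the scheduled
# clear_export_cache job removes the file once cache_ttl expires, after which
# a later call regenerates it.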