def _upload_bulk_add(self, func_item_to_kv, dataset_id, names, items, progress_cb=None):
    results = []
    if len(names) == 0:
        return results
    if len(names) != len(items):
        raise RuntimeError(
            "Can not match \"names\" and \"items\" lists, len(names) != len(items)")

    for batch in batched(list(zip(names, items))):
        images = []
        for name, item in batch:
            item_tuple = func_item_to_kv(item)
            # @TODO: 'title' -> ApiField.NAME
            images.append({'title': name, item_tuple[0]: item_tuple[1]})
        response = self._api.post('images.bulk.add', {
            ApiField.DATASET_ID: dataset_id,
            ApiField.IMAGES: images
        })
        if progress_cb is not None:
            progress_cb(len(images))

        for info_json in response.json():
            info_json_copy = info_json.copy()
            info_json_copy[ApiField.EXT] = info_json[ApiField.MIME].split('/')[1]
            results.append(self.InfoType(*[info_json_copy[field_name]
                                           for field_name in self.info_sequence()]))

    name_to_res = {img_info.name: img_info for img_info in results}
    ordered_results = [name_to_res[name] for name in names]
    return ordered_results

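# Note: every method in this section chunks its work through the SDK's batched()
# helper. The snippet below is an illustrative, simplified equivalent (an assumption;
# the real helper and its default batch size may differ) that makes the chunking
# behaviour explicit.
def _batched_sketch(seq, batch_size=50):
    """Yield consecutive slices of `seq` with at most `batch_size` elements each."""
    for i in range(0, len(seq), batch_size):
        yield seq[i:i + batch_size]


assert list(_batched_sketch(list(range(5)), batch_size=2)) == [[0, 1], [2, 3], [4]]
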
def _download_project(api, project_id, dest_dir, dataset_ids=None, log_progress=False, batch_size=10):
    dataset_ids = set(dataset_ids) if (dataset_ids is not None) else None
    project_fs = Project(dest_dir, OpenMode.CREATE)
    meta = ProjectMeta.from_json(api.project.get_meta(project_id))
    project_fs.set_meta(meta)

    for dataset_info in api.dataset.get_list(project_id):
        dataset_id = dataset_info.id
        if dataset_ids is not None and dataset_id not in dataset_ids:
            continue

        dataset_fs = project_fs.create_dataset(dataset_info.name)
        images = api.image.get_list(dataset_id)

        ds_progress = None
        if log_progress:
            ds_progress = Progress('Downloading dataset: {!r}'.format(dataset_info.name),
                                   total_cnt=len(images))

        for batch in batched(images, batch_size):
            image_ids = [image_info.id for image_info in batch]
            image_names = [image_info.name for image_info in batch]

            # download image data as raw bytes
            batch_imgs_bytes = api.image.download_bytes(dataset_id, image_ids)

            # download annotations in json format
            ann_infos = api.annotation.download_batch(dataset_id, image_ids)
            ann_jsons = [ann_info.annotation for ann_info in ann_infos]

            for name, img_bytes, ann in zip(image_names, batch_imgs_bytes, ann_jsons):
                dataset_fs.add_item_raw_bytes(name, img_bytes, ann)

            if log_progress:
                ds_progress.iters_done_report(len(batch))

def _upload_data_bulk(self, func_item_to_byte_stream, items_hashes, retry_cnt=3, progress_cb=None):
    """
    Upload images (binary data) to the server. Handles images that already exist on the
    server as well as duplicates within the request.
    :param func_item_to_byte_stream: converter from an "item" to a byte stream
    :param items_hashes: iterable of (item, hash) pairs, where "item" is some descriptor
        (e.g. an image file path) for the image data and "hash" is a hash of the image binary data
    :param retry_cnt: int, number of retries to send the whole set of items
    :param progress_cb: callback to report progress (in number of items)
    """
    hash_to_items = {i_hash: item for item, i_hash in items_hashes}

    unique_hashes = set(hash_to_items.keys())
    remote_hashes = set(self.check_existing_hashes(list(unique_hashes)))  # already existing on the server
    if progress_cb:
        progress_cb(len(remote_hashes))
    pending_hashes = unique_hashes - remote_hashes

    # @TODO: some correlation with sly.io.network_exceptions. Should we perform retries here?
    for retry_idx in range(retry_cnt):
        # single attempt to upload all data that has not been uploaded yet
        for hashes in batched(list(pending_hashes)):
            pending_hashes_items = [(h, hash_to_items[h]) for h in hashes]
            hashes_rcv = self._upload_uniq_images_single_req(func_item_to_byte_stream,
                                                             pending_hashes_items)
            pending_hashes -= set(hashes_rcv)
            if set(hashes_rcv) - set(hashes):
                logger.warn('Hash inconsistency in images bulk upload.',
                            extra={'sent': hashes, 'received': hashes_rcv})
            if progress_cb:
                progress_cb(len(hashes_rcv))

        if not pending_hashes:
            return

        logger.warn('Unable to upload images (data).', extra={
            'retry_idx': retry_idx,
            'items': [(h, hash_to_items[h]) for h in pending_hashes]
        })
        # retry, in case the failure was a transient server/connection error

    raise RuntimeError("Unable to upload images (data). "
                       "Please check that the images are in a supported format and are not corrupted.")

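# A minimal sketch of how the (item, hash) pairs expected by _upload_data_bulk could
# be built, assuming items are local file paths and the hash is a base64-encoded
# SHA-256 digest of the file content (assumption: this matches the hashing convention
# used by check_existing_hashes; the paths below are hypothetical).
import base64
import hashlib
import io


def _file_to_stream(path):
    # Read the whole file into an in-memory byte stream.
    with open(path, 'rb') as f:
        return io.BytesIO(f.read())


def _file_hash(path):
    # Hash the raw bytes and return a base64-encoded string.
    with open(path, 'rb') as f:
        return base64.b64encode(hashlib.sha256(f.read()).digest()).decode('utf-8')


# items_hashes = [(p, _file_hash(p)) for p in ['/data/img_0001.jpg', '/data/img_0002.jpg']]
# self._upload_data_bulk(_file_to_stream, items_hashes)
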
def download_import_files(self, task_id, data_dir):
    import_struct = self.api.simple_request('GetImportStructure', sly.api_proto.ListFiles,
                                            sly.api_proto.Id(id=task_id))
    progress = sly.Progress('Downloading', len(import_struct.files), self.logger)

    def maybe_close_fh(fh, pbar, downloaded_paths: set):
        if fh is not None:
            if fh.close_and_check():
                pbar.iter_done_report()
                downloaded_paths.add(fh.file_path)
            else:
                self.logger.warning('file was skipped while downloading',
                                    extra={'file_path': fh.file_path})

    files_to_download = list(import_struct.files)
    for batch in batched(files_to_download):
        # Store the file names that have already been downloaded from this batch
        # to avoid rewriting them on transmission retries if connection issues arise.
        downloaded_from_batch = set()

        file_handler = None
        for chunk in self.api.get_stream_with_data(
                'GetImportFiles', sly.api_proto.ChunkFile,
                sly.api_proto.ImportRequest(task_id=task_id, files=batch)):
            new_fpath = chunk.file.path
            if new_fpath:  # non-empty
                maybe_close_fh(file_handler, progress, downloaded_from_batch)
                real_fpath = os.path.join(data_dir, new_fpath.lstrip('/'))
                if real_fpath in downloaded_from_batch:
                    file_handler = None
                else:
                    self.logger.trace('download import file', extra={'file_path': real_fpath})
                    file_handler = sly.ChunkedFileWriter(file_path=real_fpath)

            if file_handler is not None:
                file_handler.write(chunk.chunk)

        maybe_close_fh(file_handler, progress, downloaded_from_batch)

def _download_batch(self, dataset_id, ids):
    '''
    Generate (image id, content) pairs for the given dataset and list of image ids.
    :param dataset_id: int
    :param ids: list of integers
    '''
    for batch_ids in batched(ids):
        response = self._api.post('images.bulk.download', {
            ApiField.DATASET_ID: dataset_id,
            ApiField.IMAGE_IDS: batch_ids
        })
        decoder = MultipartDecoder.from_response(response)
        for part in decoder.parts:
            content_utf8 = part.headers[b'Content-Disposition'].decode('utf-8')
            # Find name="1245" preceded by whitespace, a semicolon or the beginning of the line.
            # The regex has 2 capture groups: one for the prefix and one for the actual name value.
            img_id = int(re.findall(r'(^|[\s;])name="(\d*)"', content_utf8)[0][1])
            yield img_id, part

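# Standalone check of the Content-Disposition parsing used in _download_batch above:
# the second capture group holds the numeric image id, and 'filename="..."' does not
# match because 'name' there is not preceded by whitespace, a semicolon or line start.
import re

_header = 'form-data; name="1245"; filename="1245"'
_matches = re.findall(r'(^|[\s;])name="(\d*)"', _header)
assert int(_matches[0][1]) == 1245
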
def _upload_bulk_add(self, func_item_to_kv, dataset_id, names, items, metas=None, progress_cb=None):
    if metas is None:
        metas = [{}] * len(items)

    results = []
    if len(names) == 0:
        return results
    if len(names) != len(items):
        raise RuntimeError(
            "Can not match \"names\" and \"items\" lists, len(names) != len(items)")

    for batch in batched(list(zip(names, items, metas))):
        images = []
        for name, item, meta in batch:
            item_tuple = func_item_to_kv(item)
            images.append({
                ApiField.NAME: name,
                item_tuple[0]: item_tuple[1],
                ApiField.META: meta if meta is not None else {}
            })
        response = self._api.post('point-clouds.bulk.add', {
            ApiField.DATASET_ID: dataset_id,
            ApiField.POINTCLOUDS: images
        })
        if progress_cb is not None:
            progress_cb(len(images))
        # accumulate results across batches instead of overwriting them,
        # otherwise the name lookup below fails for multi-batch uploads
        results.extend(self._convert_json_info(item) for item in response.json())

    name_to_res = {img_info.name: img_info for img_info in results}
    ordered_results = [name_to_res[name] for name in names]
    return ordered_results

def _upload_batch(self, func_ann_to_json, img_ids, anns, progress_cb=None):
    # img_ids from the same dataset
    if len(img_ids) == 0:
        return
    if len(img_ids) != len(anns):
        raise RuntimeError(
            'Can not match "img_ids" and "anns" lists, len(img_ids) != len(anns)')

    dataset_id = self._api.image.get_info_by_id(img_ids[0]).dataset_id
    for batch in batched(list(zip(img_ids, anns))):
        data = [{ApiField.IMAGE_ID: img_id, ApiField.ANNOTATION: func_ann_to_json(ann)}
                for img_id, ann in batch]
        self._api.post('annotations.bulk.add', data={
            ApiField.DATASET_ID: dataset_id,
            ApiField.ANNOTATIONS: data
        })
        if progress_cb is not None:
            progress_cb(len(batch))

def upload_batch_paths(self, dataset_id, img_ids, ann_paths, progress_cb=None):
    MAX_BATCH_SIZE = 50
    for batch in batched(list(zip(img_ids, ann_paths)), MAX_BATCH_SIZE):
        data = []
        for img_id, ann_path in batch:
            with open(ann_path) as json_file:
                ann_json = json.load(json_file)
            data.append({ApiField.IMAGE_ID: img_id, ApiField.ANNOTATION: ann_json})
        self.api.post('annotations.bulk.add', data={
            ApiField.DATASET_ID: dataset_id,
            ApiField.ANNOTATIONS: data
        })
        if progress_cb is not None:
            progress_cb(len(batch))

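def _upload_batch_paths_example(api, dataset_id, img_ids, ann_paths):
    # Hypothetical usage sketch (not part of the SDK): assumes upload_batch_paths is
    # exposed on api.annotation, that img_ids already exist in dataset_id, and that
    # each path in ann_paths points to a Supervisely-format annotation JSON file.
    progress = sly.Progress('Uploading annotations', len(img_ids))
    api.annotation.upload_batch_paths(dataset_id, img_ids, ann_paths,
                                      progress_cb=progress.iters_done_report)
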
def get_info_by_id_batch(self, ids):
    '''
    :param ids: list of integers
    :return: list of image metadata, in the same order as the input ids
    '''
    results = []
    if len(ids) == 0:
        return results
    dataset_id = self.get_info_by_id(ids[0]).dataset_id

    for batch in batched(ids):
        filters = [{"field": ApiField.ID, "operator": "in", "value": batch}]
        results.extend(self.get_list_all_pages('images.list', {
            ApiField.DATASET_ID: dataset_id,
            ApiField.FILTER: filters
        }))

    temp_map = {info.id: info for info in results}
    ordered_results = [temp_map[id] for id in ids]
    return ordered_results

def _upload_bulk_add(self, func_item_to_kv, dataset_id, names, items, progress_cb=None):
    results = []
    if len(names) == 0:
        return results
    if len(names) != len(items):
        raise RuntimeError(
            "Can not match \"names\" and \"items\" lists, len(names) != len(items)")

    for batch in batched(list(zip(names, items))):
        images = []
        for name, item in batch:
            item_tuple = func_item_to_kv(item)
            # @TODO: 'title' -> ApiField.NAME
            images.append({'title': name, item_tuple[0]: item_tuple[1]})
        response = self.api.post('images.bulk.add', {
            ApiField.DATASET_ID: dataset_id,
            ApiField.IMAGES: images
        })
        if progress_cb is not None:
            progress_cb(len(images))
        results.extend([self._convert_json_info(info_json) for info_json in response.json()])

    name_to_res = {img_info.name: img_info for img_info in results}
    ordered_results = [name_to_res[name] for name in names]
    return ordered_results

def _upload_data_bulk(self, func_item_to_byte_stream, func_item_hash, items, progress_cb):
    hashes = []
    if len(items) == 0:
        return hashes

    hash_to_items = defaultdict(list)
    for idx, item in enumerate(items):
        item_hash = func_item_hash(item)
        hashes.append(item_hash)
        hash_to_items[item_hash].append(item)

    unique_hashes = set(hashes)
    remote_hashes = self.check_existing_hashes(list(unique_hashes))
    new_hashes = unique_hashes - set(remote_hashes)
    if progress_cb is not None:
        progress_cb(len(remote_hashes))

    # upload only new images to the Supervisely server
    items_to_upload = []
    for hash in new_hashes:
        items_to_upload.extend(hash_to_items[hash])

    for batch in batched(items_to_upload):
        content_dict = {}
        for idx, item in enumerate(batch):
            content_dict["{}-file".format(idx)] = (str(idx), func_item_to_byte_stream(item), 'image/*')
        encoder = MultipartEncoder(fields=content_dict)
        self.api.post('images.bulk.upload', encoder)
        if progress_cb is not None:
            progress_cb(len(batch))

    return hashes

def remove_batch(self, ids, progress_cb=None):
    for ids_batch in batched(ids):
        self._api.post(self._remove_batch_api_method_name(),
                       {self._remove_batch_field_name(): ids_batch})
        if progress_cb is not None:
            progress_cb(len(ids_batch))

def download_video_project(api, project_id, dest_dir, dataset_ids=None, download_videos=True, log_progress=False):
    '''
    Download the video project with the given id into the destination directory.
    :param api: Api class object
    :param project_id: int
    :param dest_dir: str
    :param dataset_ids: list of integers
    :param download_videos: bool
    :param log_progress: bool
    '''
    LOG_BATCH_SIZE = 1

    key_id_map = KeyIdMap()

    project_fs = VideoProject(dest_dir, OpenMode.CREATE)
    meta = ProjectMeta.from_json(api.project.get_meta(project_id))
    project_fs.set_meta(meta)

    datasets_infos = []
    if dataset_ids is not None:
        for ds_id in dataset_ids:
            datasets_infos.append(api.dataset.get_info_by_id(ds_id))
    else:
        datasets_infos = api.dataset.get_list(project_id)

    for dataset in datasets_infos:
        dataset_fs = project_fs.create_dataset(dataset.name)
        videos = api.video.get_list(dataset.id)

        ds_progress = None
        if log_progress:
            ds_progress = Progress('Downloading dataset: {!r}'.format(dataset.name),
                                   total_cnt=len(videos))

        for batch in batched(videos, batch_size=LOG_BATCH_SIZE):
            video_ids = [video_info.id for video_info in batch]
            video_names = [video_info.name for video_info in batch]

            ann_jsons = api.video.annotation.download_bulk(dataset.id, video_ids)

            for video_id, video_name, ann_json in zip(video_ids, video_names, ann_jsons):
                if video_name != ann_json[ApiField.VIDEO_NAME]:
                    raise RuntimeError("Error in api.video.annotation.download_bulk: broken order")

                video_file_path = dataset_fs.generate_item_path(video_name)
                if download_videos is True:
                    api.video.download_path(video_id, video_file_path)
                else:
                    touch(video_file_path)

                dataset_fs.add_item_file(video_name, video_file_path,
                                         ann=VideoAnnotation.from_json(ann_json, project_fs.meta, key_id_map),
                                         _validate_item=False)

            # guard the progress report so the call is skipped when logging is disabled
            if log_progress:
                ds_progress.iters_done_report(len(batch))

    project_fs.set_key_id_map(key_id_map)

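def _download_video_project_example():
    # Hypothetical usage sketch with placeholder values: assumes a reachable
    # Supervisely instance, a valid API token, and that the Api class from this SDK
    # is importable in the current module.
    api = Api(server_address='https://app.supervise.ly', token='<your token>')
    download_video_project(api, project_id=123, dest_dir='/tmp/video_project',
                           dataset_ids=None, download_videos=True, log_progress=True)
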
def _download_dataset(api: Api, dataset, dataset_id, cache=None, progress_cb=None):
    images = api.image.get_list(dataset_id)

    images_to_download = images

    # copy images from the cache to the task folder and download the corresponding annotations
    if cache:
        images_to_download, images_in_cache, images_cache_paths = _split_images_by_cache(images, cache)
        if len(images_to_download) + len(images_in_cache) != len(images):
            raise RuntimeError("Error with images cache during download. Please contact support.")
        logger.info(f"Download dataset: {dataset.name}", extra={
            "total": len(images),
            "in cache": len(images_in_cache),
            "to download": len(images_to_download)
        })
        if len(images_in_cache) > 0:
            img_cache_ids = [img_info.id for img_info in images_in_cache]
            ann_info_list = api.annotation.download_batch(dataset_id, img_cache_ids, progress_cb)
            img_name_to_ann = {ann.image_id: ann.annotation for ann in ann_info_list}
            for batch in batched(list(zip(images_in_cache, images_cache_paths)), batch_size=50):
                for img_info, img_cache_path in batch:
                    item_name = _maybe_append_image_extension(img_info.name, img_info.ext)
                    dataset.add_item_file(item_name, img_cache_path,
                                          img_name_to_ann[img_info.id],
                                          _validate_item=False,
                                          _use_hardlink=True)
                if progress_cb is not None:
                    progress_cb(len(batch))

    # download images from the server
    if len(images_to_download) > 0:
        # prepare lists for api methods
        img_ids = []
        img_paths = []
        for img_info in images_to_download:
            img_ids.append(img_info.id)
            # TODO download to a temp file and use the dataset api to add the image to the dataset.
            img_paths.append(
                os.path.join(dataset.img_dir,
                             _maybe_append_image_extension(img_info.name, img_info.ext)))

        # download annotations
        ann_info_list = api.annotation.download_batch(dataset_id, img_ids, progress_cb)
        img_name_to_ann = {ann.image_id: ann.annotation for ann in ann_info_list}
        api.image.download_paths(dataset_id, img_ids, img_paths, progress_cb)
        for img_info, img_path in zip(images_to_download, img_paths):
            dataset.add_item_file(img_info.name, img_path, img_name_to_ann[img_info.id])

        if cache:
            img_hashes = [img_info.hash for img_info in images_to_download]
            cache.write_objects(img_paths, img_hashes)

def download_pointcloud_project(api, project_id, dest_dir, dataset_ids=None, download_items=True, log_progress=False):
    LOG_BATCH_SIZE = 1

    key_id_map = KeyIdMap()

    project_fs = PointcloudProject(dest_dir, OpenMode.CREATE)
    meta = ProjectMeta.from_json(api.project.get_meta(project_id))
    project_fs.set_meta(meta)

    datasets_infos = []
    if dataset_ids is not None:
        for ds_id in dataset_ids:
            datasets_infos.append(api.dataset.get_info_by_id(ds_id))
    else:
        datasets_infos = api.dataset.get_list(project_id)

    for dataset in datasets_infos:
        dataset_fs = project_fs.create_dataset(dataset.name)
        pointclouds = api.pointcloud.get_list(dataset.id)

        ds_progress = None
        if log_progress:
            ds_progress = Progress('Downloading dataset: {!r}'.format(dataset.name),
                                   total_cnt=len(pointclouds))

        for batch in batched(pointclouds, batch_size=LOG_BATCH_SIZE):
            pointcloud_ids = [pointcloud_info.id for pointcloud_info in batch]
            pointcloud_names = [pointcloud_info.name for pointcloud_info in batch]

            ann_jsons = api.pointcloud.annotation.download_bulk(dataset.id, pointcloud_ids)

            for pointcloud_id, pointcloud_name, ann_json in zip(pointcloud_ids, pointcloud_names, ann_jsons):
                if pointcloud_name != ann_json[ApiField.NAME]:
                    raise RuntimeError("Error in api.pointcloud.annotation.download_bulk: broken order")

                pointcloud_file_path = dataset_fs.generate_item_path(pointcloud_name)
                if download_items is True:
                    api.pointcloud.download_path(pointcloud_id, pointcloud_file_path)

                    related_images_path = dataset_fs.get_related_images_path(pointcloud_name)
                    related_images = api.pointcloud.get_list_related_images(pointcloud_id)
                    for rimage_info in related_images:
                        name = rimage_info[ApiField.NAME]
                        rimage_id = rimage_info[ApiField.ID]

                        path_img = os.path.join(related_images_path, name)
                        path_json = os.path.join(related_images_path, name + ".json")

                        api.pointcloud.download_related_image(rimage_id, path_img)
                        dump_json_file(rimage_info, path_json)
                else:
                    touch(pointcloud_file_path)

                dataset_fs.add_item_file(pointcloud_name, pointcloud_file_path,
                                         ann=PointcloudAnnotation.from_json(ann_json, project_fs.meta, key_id_map),
                                         _validate_item=False)

            # guard the progress report so the call is skipped when logging is disabled
            if log_progress:
                ds_progress.iters_done_report(len(batch))

    project_fs.set_key_id_map(key_id_map)