def upload_files(self, task_id, abs_paths, names, progress_cb=None):
    if len(abs_paths) != len(names):
        raise RuntimeError("Inconsistency: len(abs_paths) != len(names)")
    if len(abs_paths) == 0:
        return

    hashes = []
    hash_to_items = defaultdict(list)
    hash_to_name = defaultdict(list)
    for path, name in zip(abs_paths, names):
        item_hash = get_file_hash(path)
        hashes.append(item_hash)
        hash_to_items[item_hash].append(path)
        hash_to_name[item_hash].append(name)

    unique_hashes = set(hashes)
    remote_hashes = self._api.image.check_existing_hashes(list(unique_hashes))
    new_hashes = unique_hashes - set(remote_hashes)

    # @TODO: upload remote hashes
    # Files whose hashes already exist on the server are attached by hash only,
    # without re-uploading the binary data.
    if len(remote_hashes) != 0:
        files = []
        for hash in remote_hashes:
            for name in hash_to_name[hash]:
                files.append({ApiField.NAME: name, ApiField.HASH: hash})
        for batch in batched(files):
            resp = self._api.post('tasks.files.bulk.add-by-hash',
                                  {ApiField.TASK_ID: task_id, ApiField.FILES: batch})
    if progress_cb is not None:
        progress_cb(len(remote_hashes))

    # Upload binary data only for hashes that are not on the server yet.
    for batch in batched(list(zip(abs_paths, names, hashes))):
        content_dict = OrderedDict()
        for idx, item in enumerate(batch):
            path, name, hash = item
            if hash in remote_hashes:
                continue
            content_dict["{}".format(idx)] = json.dumps({"fullpath": name, "hash": hash})
            content_dict["{}-file".format(idx)] = (name, open(path, 'rb'), '')
        if len(content_dict) > 0:
            encoder = MultipartEncoder(fields=content_dict)
            resp = self._api.post('tasks.files.bulk.upload', encoder)
            if progress_cb is not None:
                # content_dict holds two entries (meta + file stream) per uploaded file
                progress_cb(len(content_dict) // 2)
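
# A hedged usage sketch for upload_files with a simple counting callback.
# Api.from_env(), the `api.task` accessor, the task id and the file paths are
# illustrative assumptions, not taken from this file.
api = Api.from_env()

uploaded = {'count': 0}

def _on_upload_progress(n):
    # progress_cb receives the number of items processed in the last step
    uploaded['count'] += n
    print('uploaded so far:', uploaded['count'])

api.task.upload_files(task_id=777,
                      abs_paths=['/data/a.png', '/data/b.png'],
                      names=['a.png', 'b.png'],
                      progress_cb=_on_upload_progress)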
def _upload_bulk_add(self, func_item_to_kv, dataset_id, names, items, metas=None, progress_cb=None):
    if metas is None:
        metas = [{}] * len(items)

    results = []
    if len(names) == 0:
        return results
    if len(names) != len(items):
        raise RuntimeError('Can not match "names" and "items" lists, len(names) != len(items)')

    for batch in batched(list(zip(names, items, metas))):
        images = []
        for name, item, meta in batch:
            item_tuple = func_item_to_kv(item)
            images.append({ApiField.NAME: name,
                           item_tuple[0]: item_tuple[1],
                           ApiField.META: meta if meta is not None else {}})
        response = self._api.post('point-clouds.bulk.add',
                                  {ApiField.DATASET_ID: dataset_id, ApiField.POINTCLOUDS: images})
        if progress_cb is not None:
            progress_cb(len(images))
        results.extend([self._convert_json_info(item) for item in response.json()])

    name_to_res = {img_info.name: img_info for img_info in results}
    ordered_results = [name_to_res[name] for name in names]
    return ordered_results
def _upload_data_bulk(self, func_item_to_byte_stream, func_item_hash, items, progress_cb):
    hashes = []
    if len(items) == 0:
        return hashes

    hash_to_items = defaultdict(list)
    for item in items:
        item_hash = func_item_hash(item)
        hashes.append(item_hash)
        hash_to_items[item_hash].append(item)

    unique_hashes = set(hashes)
    remote_hashes = self.check_existing_hashes(list(unique_hashes))
    new_hashes = unique_hashes - set(remote_hashes)
    if progress_cb is not None:
        progress_cb(len(remote_hashes))

    # Upload only new items to the Supervisely server.
    items_to_upload = []
    for hash in new_hashes:
        items_to_upload.extend(hash_to_items[hash])

    for batch in batched(items_to_upload):
        content_dict = {}
        for idx, item in enumerate(batch):
            content_dict["{}-file".format(idx)] = (str(idx), func_item_to_byte_stream(item), 'pcd/*')
        encoder = MultipartEncoder(fields=content_dict)
        self._api.post('point-clouds.bulk.upload', encoder)
        if progress_cb is not None:
            progress_cb(len(batch))

    return hashes
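
# The batched() helper used throughout these methods is not defined here; it is
# assumed to yield consecutive fixed-size chunks of a sequence. A minimal sketch
# under that assumption (the default chunk size of 50 is illustrative):
def _batched_sketch(seq, batch_size=50):
    for i in range(0, len(seq), batch_size):
        yield seq[i:i + batch_size]

# e.g. list(_batched_sketch([1, 2, 3, 4, 5], batch_size=2)) == [[1, 2], [3, 4], [5]]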
def check_existing_hashes(self, hashes):
    results = []
    if len(hashes) == 0:
        return results
    for hashes_batch in batched(hashes, batch_size=900):
        response = self._api.post('images.internal.hashes.list', hashes_batch)
        results.extend(response.json())
    return results
def _append_bulk(self, entity_id, figures_json, figures_keys, key_id_map: KeyIdMap,
                 field_name=ApiField.ENTITY_ID):
    if len(figures_json) == 0:
        return
    # Both lists are batched with the same batch_size, so each key batch stays
    # aligned with its JSON batch.
    for (batch_keys, batch_jsons) in zip(batched(figures_keys, batch_size=100),
                                         batched(figures_json, batch_size=100)):
        resp = self._api.post('figures.bulk.add',
                              {field_name: entity_id, ApiField.FIGURES: batch_jsons})
        for key, resp_obj in zip(batch_keys, resp.json()):
            figure_id = resp_obj[ApiField.ID]
            key_id_map.add_figure(key, figure_id)
def _download_batch_by_hashes(self, hashes):
    for batch_hashes in batched(hashes):
        response = self._api.post('images.bulk.download-by-hash',
                                  {ApiField.HASHES: batch_hashes})
        decoder = MultipartDecoder.from_response(response)
        for part in decoder.parts:
            content_utf8 = part.headers[b'Content-Disposition'].decode('utf-8')
            # The header looks like 'form-data; name="<hash>"'; strip the prefix
            # and the trailing quote to recover the hash.
            h = content_utf8.replace('form-data; name="', '')[:-1]
            yield h, part
def check_existing_hashes(self, hashes):
    '''
    :param hashes: list of str
    :return: list of json objects (None if an image with the given hash does not exist)
    '''
    results = []
    if len(hashes) == 0:
        return results
    for hashes_batch in batched(hashes, batch_size=900):
        response = self._api.post('images.internal.hashes.list', hashes_batch)
        results.extend(response.json())
    return results
def _upload_batch(self, func_ann_to_json, img_ids, anns, progress_cb=None):
    # img_ids must belong to the same dataset
    if len(img_ids) == 0:
        return
    if len(img_ids) != len(anns):
        raise RuntimeError('Can not match "img_ids" and "anns" lists, len(img_ids) != len(anns)')

    dataset_id = self._api.image.get_info_by_id(img_ids[0]).dataset_id
    for batch in batched(list(zip(img_ids, anns))):
        data = [{ApiField.IMAGE_ID: img_id, ApiField.ANNOTATION: func_ann_to_json(ann)}
                for img_id, ann in batch]
        self._api.post('annotations.bulk.add',
                       data={ApiField.DATASET_ID: dataset_id, ApiField.ANNOTATIONS: data})
        if progress_cb is not None:
            progress_cb(len(batch))
def _upload_data_bulk(self, func_item_to_byte_stream, items_hashes, retry_cnt=3,
                      progress_cb=None, item_progress=None):
    hash_to_items = {i_hash: item for item, i_hash in items_hashes}

    unique_hashes = set(hash_to_items.keys())
    remote_hashes = set(self.check_existing_hashes(list(unique_hashes)))  # existing -- from server
    if progress_cb:
        progress_cb(len(remote_hashes))
    pending_hashes = unique_hashes - remote_hashes

    for retry_idx in range(retry_cnt):
        # single attempt to upload all data which is not uploaded yet
        for hashes in batched(list(pending_hashes)):
            pending_hashes_items = [(h, hash_to_items[h]) for h in hashes]
            hashes_rcv = self._upload_uniq_videos_single_req(func_item_to_byte_stream,
                                                             pending_hashes_items, item_progress)
            pending_hashes -= set(hashes_rcv)
            if set(hashes_rcv) - set(hashes):
                logger.warn('Hash inconsistency in videos bulk upload.',
                            extra={'sent': hashes, 'received': hashes_rcv})
            if progress_cb:
                progress_cb(len(hashes_rcv))

        if not pending_hashes:
            return

        # Retry, in case the failure was a transient server or connection error.
        logger.warn('Unable to upload videos (data).',
                    extra={'retry_idx': retry_idx,
                           'items': [(h, hash_to_items[h]) for h in pending_hashes]})

    raise RuntimeError("Unable to upload videos (data). "
                       "Please check that the videos are in a supported format and are not corrupted.")
def copy_batch(self, src_image_ids, dst_image_ids, progress_cb=None):
    if len(src_image_ids) != len(dst_image_ids):
        raise RuntimeError('Can not match "src_image_ids" and "dst_image_ids" lists, '
                           'len(src_image_ids) != len(dst_image_ids)')
    if len(src_image_ids) == 0:
        return

    src_dataset_id = self._api.image.get_info_by_id(src_image_ids[0]).dataset_id
    for cur_batch in batched(list(zip(src_image_ids, dst_image_ids))):
        src_ids_batch, dst_ids_batch = zip(*cur_batch)
        ann_infos = self.download_batch(src_dataset_id, src_ids_batch)
        ann_jsons = [ann_info.annotation for ann_info in ann_infos]
        self.upload_jsons(dst_ids_batch, ann_jsons)
        if progress_cb is not None:
            progress_cb(len(src_ids_batch))
def _download_batch(self, dataset_id, ids):
    '''
    Generate (image id, content) pairs for the given dataset and list of image ids
    :param dataset_id: int
    :param ids: list of integers
    '''
    for batch_ids in batched(ids):
        response = self._api.post('images.bulk.download',
                                  {ApiField.DATASET_ID: dataset_id, ApiField.IMAGE_IDS: batch_ids})
        decoder = MultipartDecoder.from_response(response)
        for part in decoder.parts:
            content_utf8 = part.headers[b'Content-Disposition'].decode('utf-8')
            # Find name="1245" preceded by a whitespace, semicolon or beginning of line.
            # The regex has 2 capture groups: one for the prefix and one for the actual name value.
            img_id = int(re.findall(r'(^|[\s;])name="(\d*)"', content_utf8)[0][1])
            yield img_id, part
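
# Illustrative check of the Content-Disposition parsing above; the header value
# is an assumed example, not taken from a real server response.
import re

_header = 'form-data; name="1245"; filename="1245"'
assert int(re.findall(r'(^|[\s;])name="(\d*)"', _header)[0][1]) == 1245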
def get_info_by_id_batch(self, ids):
    '''
    :param ids: list of integers
    :return: list of image metadata, ordered as the input ids
    '''
    results = []
    if len(ids) == 0:
        return results

    dataset_id = self.get_info_by_id(ids[0]).dataset_id
    for batch in batched(ids):
        filters = [{"field": ApiField.ID, "operator": "in", "value": batch}]
        results.extend(self.get_list_all_pages('images.list',
                                               {ApiField.DATASET_ID: dataset_id,
                                                ApiField.FILTER: filters}))

    temp_map = {info.id: info for info in results}
    ordered_results = [temp_map[id] for id in ids]
    return ordered_results
def append_labels(self, image_id, labels):
    if len(labels) == 0:
        return

    payload = []
    for label in labels:
        _label_json = label.to_json()
        _label_json["geometry"] = label.geometry.to_json()
        if "classId" not in _label_json:
            raise KeyError("Update project meta from server to get class id")
        payload.append(_label_json)

    added_ids = []
    for batch_jsons in batched(payload, batch_size=100):
        resp = self._api.post('figures.bulk.add',
                              {ApiField.ENTITY_ID: image_id, ApiField.FIGURES: batch_jsons})
        for resp_obj in resp.json():
            figure_id = resp_obj[ApiField.ID]
            added_ids.append(figure_id)
def _upload_data_bulk(self, func_item_to_byte_stream, items_hashes, retry_cnt=3, progress_cb=None):
    """
    Upload images (binary data) to the server. Works with already existing or duplicated images.
    :param func_item_to_byte_stream: converter for an "item" to a byte stream
    :param items_hashes: iterable of pairs (item, hash) where "item" is some descriptor
           (e.g. an image file path) for the image data, and "hash" is a hash of the image binary data
    :param retry_cnt: int, number of retries to send the whole set of items
    :param progress_cb: callback to report progress (in number of items)
    """
    hash_to_items = {i_hash: item for item, i_hash in items_hashes}

    unique_hashes = set(hash_to_items.keys())
    remote_hashes = set(self.check_existing_hashes(list(unique_hashes)))  # existing -- from server
    if progress_cb:
        progress_cb(len(remote_hashes))
    pending_hashes = unique_hashes - remote_hashes

    # @TODO: some correlation with sly.io.network_exceptions. Should we perform retries here?
    for retry_idx in range(retry_cnt):
        # single attempt to upload all data which is not uploaded yet
        for hashes in batched(list(pending_hashes)):
            pending_hashes_items = [(h, hash_to_items[h]) for h in hashes]
            hashes_rcv = self._upload_uniq_images_single_req(func_item_to_byte_stream,
                                                             pending_hashes_items)
            pending_hashes -= set(hashes_rcv)
            if set(hashes_rcv) - set(hashes):
                logger.warn('Hash inconsistency in images bulk upload.',
                            extra={'sent': hashes, 'received': hashes_rcv})
            if progress_cb:
                progress_cb(len(hashes_rcv))

        if not pending_hashes:
            return

        # Retry, in case the failure was a transient server or connection error.
        logger.warn('Unable to upload images (data).',
                    extra={'retry_idx': retry_idx,
                           'items': [(h, hash_to_items[h]) for h in pending_hashes]})

    raise RuntimeError("Unable to upload images (data). "
                       "Please check that the images are in a supported format and are not corrupted.")
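
# A hedged usage sketch of how a Progress counter (used further below in the
# project download helpers) can be wired as progress_cb for a bulk image upload.
# Api.from_env(), the upload_paths wrapper, the dataset id and the file lists are
# illustrative assumptions, not taken from this file.
api = Api.from_env()
img_paths = ['/data/img_001.jpg', '/data/img_002.jpg']
img_names = ['img_001.jpg', 'img_002.jpg']
progress = Progress('Uploading images', total_cnt=len(img_paths))
api.image.upload_paths(dataset_id=111, names=img_names, paths=img_paths,
                       progress_cb=progress.iters_done_report)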
def download_batch(self, dataset_id, image_ids, progress_cb=None, with_custom_data=False):
    '''
    :param dataset_id: int
    :param image_ids: list of integers
    :param progress_cb: progress callback
    :return: list of serialized JSON annotations for the given dataset id and image ids
    '''
    id_to_ann = {}
    for batch in batched(image_ids):
        post_data = {
            ApiField.DATASET_ID: dataset_id,
            ApiField.IMAGE_IDS: batch,
            ApiField.WITH_CUSTOM_DATA: with_custom_data
        }
        results = self._api.post('annotations.bulk.info', data=post_data).json()
        for ann_dict in results:
            ann_info = self._convert_json_info(ann_dict)
            id_to_ann[ann_info.image_id] = ann_info
        if progress_cb is not None:
            progress_cb(len(batch))
    ordered_results = [id_to_ann[image_id] for image_id in image_ids]
    return ordered_results
def _upload_bulk_add(self, func_item_to_kv, dataset_id, names, items, progress_cb=None, metas=None):
    results = []
    if len(names) == 0:
        return results
    if len(names) != len(items):
        raise RuntimeError('Can not match "names" and "items" lists, len(names) != len(items)')

    if metas is None:
        metas = [{}] * len(names)
    elif len(names) != len(metas):
        raise RuntimeError('Can not match "names" and "metas" lists, len(names) != len(metas)')

    for batch in batched(list(zip(names, items, metas))):
        images = []
        for name, item, meta in batch:
            item_tuple = func_item_to_kv(item)
            # @TODO: 'title' -> ApiField.NAME
            image_data = {'title': name, item_tuple[0]: item_tuple[1]}
            if len(meta) != 0 and type(meta) == dict:
                image_data[ApiField.META] = meta
            images.append(image_data)

        response = self._api.post('images.bulk.add',
                                  {ApiField.DATASET_ID: dataset_id, ApiField.IMAGES: images})
        if progress_cb is not None:
            progress_cb(len(images))

        for info_json in response.json():
            info_json_copy = info_json.copy()
            info_json_copy[ApiField.EXT] = info_json[ApiField.MIME].split('/')[1]
            results.append(self._convert_json_info(info_json_copy))

    # Note: results follow the server response order; they are not reordered by "names" here.
    return results
def remove_batch(self, ids, progress_cb=None):
    for ids_batch in batched(ids):
        self._api.post(self._remove_batch_api_method_name(),
                       {self._remove_batch_field_name(): ids_batch})
        if progress_cb is not None:
            progress_cb(len(ids_batch))
def download_pointcloud_episode_project(api, project_id, dest_dir, dataset_ids=None,
                                        download_pcd=True, download_related_images=True,
                                        download_annotations=True, log_progress=False,
                                        batch_size=10):
    key_id_map = KeyIdMap()

    project_fs = PointcloudEpisodeProject(dest_dir, OpenMode.CREATE)
    meta = ProjectMeta.from_json(api.project.get_meta(project_id))
    project_fs.set_meta(meta)

    datasets_infos = []
    if dataset_ids is not None:
        for ds_id in dataset_ids:
            datasets_infos.append(api.dataset.get_info_by_id(ds_id))
    else:
        datasets_infos = api.dataset.get_list(project_id)

    for dataset in datasets_infos:
        dataset_fs = project_fs.create_dataset(dataset.name)
        pointclouds = api.pointcloud_episode.get_list(dataset.id)

        if download_annotations:
            # Download annotation to project_path/dataset_path/annotation.json
            ann_json = api.pointcloud_episode.annotation.download(dataset.id)
            annotation = dataset_fs.annotation_class.from_json(ann_json, meta, key_id_map)
            dataset_fs.set_ann(annotation)

        # frames --> pointcloud mapping to project_path/dataset_path/frame_pointcloud_map.json
        frame_name_map = api.pointcloud_episode.get_frame_name_map(dataset.id)
        frame_pointcloud_map_path = dataset_fs.get_frame_pointcloud_map_path()
        dump_json_file(frame_name_map, frame_pointcloud_map_path)

        # Download data
        if log_progress:
            ds_progress = Progress('Downloading dataset: {!r}'.format(dataset.name),
                                   total_cnt=len(pointclouds))

        for batch in batched(pointclouds, batch_size=batch_size):
            pointcloud_ids = [pointcloud_info.id for pointcloud_info in batch]
            pointcloud_names = [pointcloud_info.name for pointcloud_info in batch]

            for pointcloud_id, pointcloud_name in zip(pointcloud_ids, pointcloud_names):
                pointcloud_file_path = dataset_fs.generate_item_path(pointcloud_name)
                if download_pcd is True:
                    api.pointcloud_episode.download_path(pointcloud_id, pointcloud_file_path)
                else:
                    touch(pointcloud_file_path)

                if download_related_images:
                    related_images_path = dataset_fs.get_related_images_path(pointcloud_name)
                    related_images = api.pointcloud_episode.get_list_related_images(pointcloud_id)
                    for rimage_info in related_images:
                        name = rimage_info[ApiField.NAME]
                        rimage_id = rimage_info[ApiField.ID]
                        path_img = os.path.join(related_images_path, name)
                        path_json = os.path.join(related_images_path, name + ".json")
                        api.pointcloud_episode.download_related_image(rimage_id, path_img)
                        dump_json_file(rimage_info, path_json)

                dataset_fs.add_item_file(pointcloud_name, pointcloud_file_path,
                                         _validate_item=False)

            if log_progress:
                ds_progress.iters_done_report(len(batch))

    project_fs.set_key_id_map(key_id_map)
def download_video_project(api, project_id, dest_dir, dataset_ids=None, download_videos=True,
                           log_progress=False):
    '''
    Download the project with the given id into the destination directory
    :param api: Api class object
    :param project_id: int
    :param dest_dir: str
    :param dataset_ids: list of integers
    :param download_videos: bool
    :param log_progress: bool
    '''
    LOG_BATCH_SIZE = 1

    key_id_map = KeyIdMap()

    project_fs = VideoProject(dest_dir, OpenMode.CREATE)
    meta = ProjectMeta.from_json(api.project.get_meta(project_id))
    project_fs.set_meta(meta)

    datasets_infos = []
    if dataset_ids is not None:
        for ds_id in dataset_ids:
            datasets_infos.append(api.dataset.get_info_by_id(ds_id))
    else:
        datasets_infos = api.dataset.get_list(project_id)

    for dataset in datasets_infos:
        dataset_fs = project_fs.create_dataset(dataset.name)
        videos = api.video.get_list(dataset.id)

        ds_progress = None
        if log_progress:
            ds_progress = Progress('Downloading dataset: {!r}'.format(dataset.name),
                                   total_cnt=len(videos))

        for batch in batched(videos, batch_size=LOG_BATCH_SIZE):
            video_ids = [video_info.id for video_info in batch]
            video_names = [video_info.name for video_info in batch]

            ann_jsons = api.video.annotation.download_bulk(dataset.id, video_ids)

            for video_id, video_name, ann_json in zip(video_ids, video_names, ann_jsons):
                if video_name != ann_json[ApiField.VIDEO_NAME]:
                    raise RuntimeError("Error in api.video.annotation.download_bulk: broken order")

                video_file_path = dataset_fs.generate_item_path(video_name)
                if download_videos is True:
                    api.video.download_path(video_id, video_file_path)
                else:
                    touch(video_file_path)

                dataset_fs.add_item_file(video_name, video_file_path,
                                         ann=VideoAnnotation.from_json(ann_json, project_fs.meta,
                                                                       key_id_map),
                                         _validate_item=False)

            if ds_progress is not None:
                ds_progress.iters_done_report(len(batch))

    project_fs.set_key_id_map(key_id_map)
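
# A hedged usage sketch for download_video_project. Api.from_env(), the project
# id and the destination directory are illustrative assumptions; any
# authenticated Api instance and writable path would work the same way.
if __name__ == '__main__':
    api = Api.from_env()  # assumes SERVER_ADDRESS and API_TOKEN are set in the environment
    download_video_project(api,
                           project_id=123,
                           dest_dir='/tmp/my_video_project',
                           download_videos=True,
                           log_progress=True)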
def download_pointcloud_project(api, project_id, dest_dir, dataset_ids=None, download_items=True,
                                log_progress=False):
    LOG_BATCH_SIZE = 1

    key_id_map = KeyIdMap()

    project_fs = PointcloudProject(dest_dir, OpenMode.CREATE)
    meta = ProjectMeta.from_json(api.project.get_meta(project_id))
    project_fs.set_meta(meta)

    datasets_infos = []
    if dataset_ids is not None:
        for ds_id in dataset_ids:
            datasets_infos.append(api.dataset.get_info_by_id(ds_id))
    else:
        datasets_infos = api.dataset.get_list(project_id)

    for dataset in datasets_infos:
        dataset_fs = project_fs.create_dataset(dataset.name)
        pointclouds = api.pointcloud.get_list(dataset.id)

        ds_progress = None
        if log_progress:
            ds_progress = Progress('Downloading dataset: {!r}'.format(dataset.name),
                                   total_cnt=len(pointclouds))

        for batch in batched(pointclouds, batch_size=LOG_BATCH_SIZE):
            pointcloud_ids = [pointcloud_info.id for pointcloud_info in batch]
            pointcloud_names = [pointcloud_info.name for pointcloud_info in batch]

            ann_jsons = api.pointcloud.annotation.download_bulk(dataset.id, pointcloud_ids)

            for pointcloud_id, pointcloud_name, ann_json in zip(pointcloud_ids, pointcloud_names,
                                                                ann_jsons):
                if pointcloud_name != ann_json[ApiField.NAME]:
                    raise RuntimeError("Error in api.pointcloud.annotation.download_bulk: broken order")

                pointcloud_file_path = dataset_fs.generate_item_path(pointcloud_name)
                if download_items is True:
                    api.pointcloud.download_path(pointcloud_id, pointcloud_file_path)

                    related_images_path = dataset_fs.get_related_images_path(pointcloud_name)
                    related_images = api.pointcloud.get_list_related_images(pointcloud_id)
                    for rimage_info in related_images:
                        name = rimage_info[ApiField.NAME]
                        if not has_valid_ext(name):
                            new_name = get_file_name(name)  # to fix cases like .png.json
                            if has_valid_ext(new_name):
                                name = new_name
                                rimage_info[ApiField.NAME] = name
                            else:
                                raise RuntimeError('Something wrong with photo context filenames. '
                                                   'Please, contact support')
                        rimage_id = rimage_info[ApiField.ID]
                        path_img = os.path.join(related_images_path, name)
                        path_json = os.path.join(related_images_path, name + ".json")
                        api.pointcloud.download_related_image(rimage_id, path_img)
                        dump_json_file(rimage_info, path_json)
                else:
                    touch(pointcloud_file_path)

                dataset_fs.add_item_file(pointcloud_name, pointcloud_file_path,
                                         ann=PointcloudAnnotation.from_json(ann_json, project_fs.meta,
                                                                            key_id_map),
                                         _validate_item=False)

            if ds_progress is not None:
                ds_progress.iters_done_report(len(batch))

    project_fs.set_key_id_map(key_id_map)