Example #1
    def _upload_bulk_add(self,
                         func_item_to_kv,
                         dataset_id,
                         names,
                         items,
                         progress_cb=None):
        results = []

        if len(names) == 0:
            return results
        if len(names) != len(items):
            raise RuntimeError(
                "Can not match \"names\" and \"items\" lists, len(names) != len(items)"
            )

        for batch in batched(list(zip(names, items))):
            images = []
            for name, item in batch:
                item_tuple = func_item_to_kv(item)
                #@TODO: 'title' -> ApiField.NAME
                images.append({'title': name, item_tuple[0]: item_tuple[1]})
            response = self._api.post('images.bulk.add', {
                ApiField.DATASET_ID: dataset_id,
                ApiField.IMAGES: images
            })
            if progress_cb is not None:
                progress_cb(len(images))

            for info_json in response.json():
                info_json_copy = info_json.copy()
                info_json_copy[ApiField.EXT] = info_json[ApiField.MIME].split(
                    '/')[1]
                results.append(
                    self.InfoType(*[
                        info_json_copy[field_name]
                        for field_name in self.info_sequence()
                    ]))

        name_to_res = {img_info.name: img_info for img_info in results}
        ordered_results = [name_to_res[name] for name in names]

        return ordered_results
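
The closing lines above restore the caller's ordering: results accumulate batch by batch, so they are re-keyed by name and then read back in the order of the original "names" list. A minimal self-contained sketch of that pattern (plain namedtuples stand in for the SDK info objects):

from collections import namedtuple

Info = namedtuple('Info', ['name', 'id'])

names = ['b.png', 'a.png', 'c.png']
# results gathered across batches, possibly in a different order than requested
results = [Info('a.png', 1), Info('c.png', 3), Info('b.png', 2)]

name_to_res = {info.name: info for info in results}
ordered_results = [name_to_res[name] for name in names]
assert [info.name for info in ordered_results] == names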
Example #2
def _download_project(api,
                      project_id,
                      dest_dir,
                      dataset_ids=None,
                      log_progress=False,
                      batch_size=10):
    dataset_ids = set(dataset_ids) if (dataset_ids is not None) else None
    project_fs = Project(dest_dir, OpenMode.CREATE)
    meta = ProjectMeta.from_json(api.project.get_meta(project_id))
    project_fs.set_meta(meta)

    for dataset_info in api.dataset.get_list(project_id):
        dataset_id = dataset_info.id
        if dataset_ids is not None and dataset_id not in dataset_ids:
            continue

        dataset_fs = project_fs.create_dataset(dataset_info.name)
        images = api.image.get_list(dataset_id)

        ds_progress = None
        if log_progress:
            ds_progress = Progress('Downloading dataset: {!r}'.format(
                dataset_info.name),
                                   total_cnt=len(images))

        for batch in batched(images, batch_size):
            image_ids = [image_info.id for image_info in batch]
            image_names = [image_info.name for image_info in batch]

            # download image data as raw bytes
            batch_imgs_bytes = api.image.download_bytes(dataset_id, image_ids)

            # download annotations in json format
            ann_infos = api.annotation.download_batch(dataset_id, image_ids)
            ann_jsons = [ann_info.annotation for ann_info in ann_infos]

            for name, img_bytes, ann in zip(image_names, batch_imgs_bytes,
                                            ann_jsons):
                dataset_fs.add_item_raw_bytes(name, img_bytes, ann)

            if log_progress:
                ds_progress.iters_done_report(len(batch))
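
A minimal usage sketch for the helper above, assuming an authenticated supervisely_lib Api client; the server address, token, project id and destination directory are placeholders:

import supervisely_lib as sly

api = sly.Api('https://app.supervise.ly', 'your-api-token')  # placeholders
_download_project(api,
                  project_id=123,
                  dest_dir='/tmp/my_project',
                  dataset_ids=None,  # or e.g. {456} to restrict the download to one dataset
                  log_progress=True,
                  batch_size=10)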
Example #3
    def _upload_data_bulk(self, func_item_to_byte_stream, items_hashes, retry_cnt=3, progress_cb=None):
        """
        Upload images (binary data) to server. Works with already existing or duplicating images.
        :param func_item_to_byte_stream: converter for "item" to byte stream
        :param items_hashes: iterable of pairs (item, hash) where "item" is a some descriptor (e.g. image file path)
         for image data, and "hash" is a hash for the image binary data
        :param retry_cnt: int, number of retries to send the whole set of items
        :param progress_cb: callback to account progress (in number of items)
        """
        hash_to_items = {i_hash: item for item, i_hash in items_hashes}

        unique_hashes = set(hash_to_items.keys())
        remote_hashes = set(self.check_existing_hashes(list(unique_hashes)))  # existing -- from server
        if progress_cb:
            progress_cb(len(remote_hashes))
        pending_hashes = unique_hashes - remote_hashes

        # @TODO: some correlation with sly.io.network_exceptions. Should we perform retries here?
        for retry_idx in range(retry_cnt):
            # single attempt to upload all data which is not uploaded yet

            for hashes in batched(list(pending_hashes)):
                pending_hashes_items = [(h, hash_to_items[h]) for h in hashes]
                hashes_rcv = self._upload_uniq_images_single_req(func_item_to_byte_stream, pending_hashes_items)
                pending_hashes -= set(hashes_rcv)
                if set(hashes_rcv) - set(hashes):
                    logger.warn('Hash inconsistency in images bulk upload.',
                                extra={'sent': hashes, 'received': hashes_rcv})
                if progress_cb:
                    progress_cb(len(hashes_rcv))

            if not pending_hashes:
                return

            logger.warn('Unable to upload images (data).', extra={
                'retry_idx': retry_idx,
                'items': [(h, hash_to_items[h]) for h in pending_hashes]
            })
            # now retry it for the case if it is a shadow server/connection error

        raise RuntimeError("Unable to upload images (data). "
                           "Please check if images are in supported format and if ones aren't corrupted.")
Example #4
    def download_import_files(self, task_id, data_dir):
        import_struct = self.api.simple_request('GetImportStructure',
                                                sly.api_proto.ListFiles,
                                                sly.api_proto.Id(id=task_id))
        progress = sly.Progress('Downloading', len(import_struct.files),
                                self.logger)

        def maybe_close_fh(fh, pbar, downloaded_paths: set):
            if fh is not None:
                if fh.close_and_check():
                    pbar.iter_done_report()
                    downloaded_paths.add(fh.file_path)
                else:
                    self.logger.warning('file was skipped while downloading',
                                        extra={'file_path': fh.file_path})

        files_to_download = list(import_struct.files)
        for batch in batched(files_to_download):
            # Store the file names that have been already downloaded from this batch
            # to avoid rewriting them on transmission retries if connection issues arise.
            downloaded_from_batch = set()
            file_handler = None
            for chunk in self.api.get_stream_with_data(
                    'GetImportFiles', sly.api_proto.ChunkFile,
                    sly.api_proto.ImportRequest(task_id=task_id, files=batch)):
                new_fpath = chunk.file.path
                if new_fpath:  # non-empty
                    maybe_close_fh(file_handler, progress,
                                   downloaded_from_batch)
                    real_fpath = os.path.join(data_dir, new_fpath.lstrip('/'))
                    if real_fpath in downloaded_from_batch:
                        file_handler = None
                    else:
                        self.logger.trace('download import file',
                                          extra={'file_path': real_fpath})
                        file_handler = sly.ChunkedFileWriter(
                            file_path=real_fpath)

                if file_handler is not None:
                    file_handler.write(chunk.chunk)

            maybe_close_fh(file_handler, progress, downloaded_from_batch)
Example #5
    def _download_batch(self, dataset_id, ids):
        '''
        Yield image id and image content for the given dataset and list of image ids
        :param dataset_id: int
        :param ids: list of integers
        '''
        for batch_ids in batched(ids):
            response = self._api.post('images.bulk.download', {
                ApiField.DATASET_ID: dataset_id,
                ApiField.IMAGE_IDS: batch_ids
            })
            decoder = MultipartDecoder.from_response(response)
            for part in decoder.parts:
                content_utf8 = part.headers[b'Content-Disposition'].decode(
                    'utf-8')
                # Find name="1245" preceded by a whitespace, semicolon or beginning of line.
                # The regex has 2 capture groups: one for the prefix and one for the actual name value.
                img_id = int(
                    re.findall(r'(^|[\s;])name="(\d*)"', content_utf8)[0][1])
                yield img_id, part
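
A minimal consumer sketch for this generator, written as a hypothetical companion method on the same API class; it assumes "import os" at module level and that each yielded part is a requests_toolbelt body part whose .content attribute holds the raw image bytes:

    def save_images_to_dir(self, dataset_id, ids, save_dir):
        # Hypothetical helper: write every downloaded image to save_dir, named by image id.
        os.makedirs(save_dir, exist_ok=True)
        for img_id, part in self._download_batch(dataset_id, ids):
            path = os.path.join(save_dir, '{}.bin'.format(img_id))
            with open(path, 'wb') as fout:
                fout.write(part.content)  # raw image bytes from the multipart response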
Example #6
    def _upload_bulk_add(self,
                         func_item_to_kv,
                         dataset_id,
                         names,
                         items,
                         metas=None,
                         progress_cb=None):
        if metas is None:
            metas = [{}] * len(items)

        results = []
        if len(names) == 0:
            return results
        if len(names) != len(items):
            raise RuntimeError(
                "Can not match \"names\" and \"items\" lists, len(names) != len(items)"
            )

        for batch in batched(list(zip(names, items, metas))):
            images = []
            for name, item, meta in batch:
                item_tuple = func_item_to_kv(item)
                images.append({
                    ApiField.NAME: name,
                    item_tuple[0]: item_tuple[1],
                    ApiField.META: meta if meta is not None else {}
                })
            response = self._api.post('point-clouds.bulk.add', {
                ApiField.DATASET_ID: dataset_id,
                ApiField.POINTCLOUDS: images
            })
            if progress_cb is not None:
                progress_cb(len(images))

            results.extend([
                self._convert_json_info(item) for item in response.json()
            ])

        name_to_res = {img_info.name: img_info for img_info in results}
        ordered_results = [name_to_res[name] for name in names]

        return ordered_results
Example #7
    def _upload_batch(self, func_ann_to_json, img_ids, anns, progress_cb=None):
        # img_ids from the same dataset
        if len(img_ids) == 0:
            return
        if len(img_ids) != len(anns):
            raise RuntimeError(
                'Can not match "img_ids" and "anns" lists, len(img_ids) != len(anns)'
            )

        dataset_id = self._api.image.get_info_by_id(img_ids[0]).dataset_id
        for batch in batched(list(zip(img_ids, anns))):
            data = [{
                ApiField.IMAGE_ID: img_id,
                ApiField.ANNOTATION: func_ann_to_json(ann)
            } for img_id, ann in batch]
            self._api.post('annotations.bulk.add',
                           data={
                               ApiField.DATASET_ID: dataset_id,
                               ApiField.ANNOTATIONS: data
                           })
            if progress_cb is not None:
                progress_cb(len(batch))
Example #8
    def upload_batch_paths(self,
                           dataset_id,
                           img_ids,
                           ann_paths,
                           progress_cb=None):
        MAX_BATCH_SIZE = 50
        for batch in batched(list(zip(img_ids, ann_paths)), MAX_BATCH_SIZE):
            data = []
            for img_id, ann_path in batch:
                with open(ann_path) as json_file:
                    ann_json = json.load(json_file)
                data.append({
                    ApiField.IMAGE_ID: img_id,
                    ApiField.ANNOTATION: ann_json
                })
            self.api.post('annotations.bulk.add',
                          data={
                              ApiField.DATASET_ID: dataset_id,
                              ApiField.ANNOTATIONS: data
                          })
            if progress_cb is not None:
                progress_cb(len(batch))
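
A minimal call sketch, assuming the method above is reachable as api.annotation.upload_batch_paths on an authenticated client (the attribute path is an assumption); the dataset id, image ids and JSON paths are placeholders:

import supervisely_lib as sly

api = sly.Api('https://app.supervise.ly', 'your-api-token')  # placeholders
img_ids = [111, 112, 113]          # existing image ids (placeholders)
ann_paths = ['/data/ann/111.json',  # matching annotation files
             '/data/ann/112.json',
             '/data/ann/113.json']
progress = sly.Progress('Uploading annotations', len(img_ids))
api.annotation.upload_batch_paths(dataset_id=456,
                                  img_ids=img_ids,
                                  ann_paths=ann_paths,
                                  progress_cb=progress.iters_done_report)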
Example #9
    def get_info_by_id_batch(self, ids):
        '''
        :param ids: list of integers
        :return: list of image metadata (info objects), in the same order as "ids"
        '''
        results = []
        if len(ids) == 0:
            return results
        dataset_id = self.get_info_by_id(ids[0]).dataset_id
        for batch in batched(ids):
            filters = [{
                "field": ApiField.ID,
                "operator": "in",
                "value": batch
            }]
            results.extend(
                self.get_list_all_pages('images.list', {
                    ApiField.DATASET_ID: dataset_id,
                    ApiField.FILTER: filters
                }))
        temp_map = {info.id: info for info in results}
        ordered_results = [temp_map[id] for id in ids]
        return ordered_results
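
A minimal usage sketch, assuming the method is exposed as api.image.get_info_by_id_batch on an authenticated client (the attribute path is an assumption); the ids are placeholders:

import supervisely_lib as sly

api = sly.Api('https://app.supervise.ly', 'your-api-token')  # placeholders
infos = api.image.get_info_by_id_batch([101, 102, 103])
for info in infos:
    print(info.id, info.name)  # returned in the same order as the requested ids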
Example #10
    def _upload_bulk_add(self,
                         func_item_to_kv,
                         dataset_id,
                         names,
                         items,
                         progress_cb=None):
        results = []

        if len(names) == 0:
            return results
        if len(names) != len(items):
            raise RuntimeError(
                "Can not match \"names\" and \"items\" lists, len(names) != len(items)"
            )

        for batch in batched(list(zip(names, items))):
            images = []
            for name, item in batch:
                item_tuple = func_item_to_kv(item)
                #@TODO: 'title' -> ApiField.NAME
                images.append({'title': name, item_tuple[0]: item_tuple[1]})
            response = self.api.post('images.bulk.add', {
                ApiField.DATASET_ID: dataset_id,
                ApiField.IMAGES: images
            })
            if progress_cb is not None:
                progress_cb(len(images))
            results.extend([
                self._convert_json_info(info_json)
                for info_json in response.json()
            ])

        name_to_res = {img_info.name: img_info for img_info in results}
        ordered_results = [name_to_res[name] for name in names]

        return ordered_results
Example #11
    def _upload_data_bulk(self, func_item_to_byte_stream, func_item_hash,
                          items, progress_cb):
        hashes = []
        if len(items) == 0:
            return hashes

        hash_to_items = defaultdict(list)

        for idx, item in enumerate(items):
            item_hash = func_item_hash(item)
            hashes.append(item_hash)
            hash_to_items[item_hash].append(item)

        unique_hashes = set(hashes)
        remote_hashes = self.check_existing_hashes(list(unique_hashes))
        new_hashes = unique_hashes - set(remote_hashes)

        if progress_cb is not None:
            progress_cb(len(remote_hashes))

        # upload only new images to supervisely server
        items_to_upload = []
        for new_hash in new_hashes:
            items_to_upload.extend(hash_to_items[new_hash])

        for batch in batched(items_to_upload):
            content_dict = {}
            for idx, item in enumerate(batch):
                content_dict["{}-file".format(idx)] = (
                    str(idx), func_item_to_byte_stream(item), 'image/*')
            encoder = MultipartEncoder(fields=content_dict)
            self.api.post('images.bulk.upload', encoder)
            if progress_cb is not None:
                progress_cb(len(batch))

        return hashes
Example #12
    def remove_batch(self, ids, progress_cb=None):
        for ids_batch in batched(ids):
            self._api.post(self._remove_batch_api_method_name(),
                           {self._remove_batch_field_name(): ids_batch})
            if progress_cb is not None:
                progress_cb(len(ids_batch))
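
A minimal usage sketch for the image flavor of this batch-removal helper, assuming it is reachable as api.image.remove_batch (an assumption); the ids are placeholders and progress reporting reuses the Progress pattern from the other examples:

import supervisely_lib as sly

api = sly.Api('https://app.supervise.ly', 'your-api-token')  # placeholders
ids_to_remove = [201, 202, 203]  # placeholder image ids
progress = sly.Progress('Removing images', len(ids_to_remove))
api.image.remove_batch(ids_to_remove, progress_cb=progress.iters_done_report)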
Example #13
def download_video_project(api,
                           project_id,
                           dest_dir,
                           dataset_ids=None,
                           download_videos=True,
                           log_progress=False):
    '''
    Download the video project with the given id to the destination directory
    :param api: Api class object
    :param project_id: int
    :param dest_dir: str
    :param dataset_ids: list of integers
    :param download_videos: bool
    :param log_progress: bool
    '''
    LOG_BATCH_SIZE = 1

    key_id_map = KeyIdMap()

    project_fs = VideoProject(dest_dir, OpenMode.CREATE)

    meta = ProjectMeta.from_json(api.project.get_meta(project_id))
    project_fs.set_meta(meta)

    datasets_infos = []
    if dataset_ids is not None:
        for ds_id in dataset_ids:
            datasets_infos.append(api.dataset.get_info_by_id(ds_id))
    else:
        datasets_infos = api.dataset.get_list(project_id)

    for dataset in datasets_infos:
        dataset_fs = project_fs.create_dataset(dataset.name)
        videos = api.video.get_list(dataset.id)

        ds_progress = None
        if log_progress:
            ds_progress = Progress('Downloading dataset: {!r}'.format(
                dataset.name),
                                   total_cnt=len(videos))
        for batch in batched(videos, batch_size=LOG_BATCH_SIZE):
            video_ids = [video_info.id for video_info in batch]
            video_names = [video_info.name for video_info in batch]

            ann_jsons = api.video.annotation.download_bulk(
                dataset.id, video_ids)

            for video_id, video_name, ann_json in zip(video_ids, video_names,
                                                      ann_jsons):
                if video_name != ann_json[ApiField.VIDEO_NAME]:
                    raise RuntimeError(
                        "Error in api.video.annotation.download_batch: broken order"
                    )

                video_file_path = dataset_fs.generate_item_path(video_name)
                if download_videos is True:
                    api.video.download_path(video_id, video_file_path)
                else:
                    touch(video_file_path)

                dataset_fs.add_item_file(video_name,
                                         video_file_path,
                                         ann=VideoAnnotation.from_json(
                                             ann_json, project_fs.meta,
                                             key_id_map),
                                         _validate_item=False)

            if log_progress:
                ds_progress.iters_done_report(len(batch))

    project_fs.set_key_id_map(key_id_map)
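
A minimal usage sketch, assuming an authenticated sly.Api client; the server address, token, project id and destination directory are placeholders:

import supervisely_lib as sly

api = sly.Api('https://app.supervise.ly', 'your-api-token')  # placeholders
download_video_project(api,
                       project_id=789,
                       dest_dir='/tmp/video_project',
                       dataset_ids=None,
                       download_videos=True,
                       log_progress=True)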
Example #14
def _download_dataset(api: Api,
                      dataset,
                      dataset_id,
                      cache=None,
                      progress_cb=None):
    images = api.image.get_list(dataset_id)

    images_to_download = images

    # copy images from cache to task folder and download corresponding annotations
    if cache:
        images_to_download, images_in_cache, images_cache_paths = _split_images_by_cache(
            images, cache)
        if len(images_to_download) + len(images_in_cache) != len(images):
            raise RuntimeError(
                "Error with images cache during download. Please contact support."
            )
        logger.info(f"Download dataset: {dataset.name}",
                    extra={
                        "total": len(images),
                        "in cache": len(images_in_cache),
                        "to download": len(images_to_download)
                    })
        if len(images_in_cache) > 0:
            img_cache_ids = [img_info.id for img_info in images_in_cache]
            ann_info_list = api.annotation.download_batch(
                dataset_id, img_cache_ids, progress_cb)
            img_name_to_ann = {
                ann.image_id: ann.annotation
                for ann in ann_info_list
            }
            for batch in batched(list(zip(images_in_cache,
                                          images_cache_paths)),
                                 batch_size=50):
                for img_info, img_cache_path in batch:
                    item_name = _maybe_append_image_extension(
                        img_info.name, img_info.ext)
                    dataset.add_item_file(item_name,
                                          img_cache_path,
                                          img_name_to_ann[img_info.id],
                                          _validate_item=False,
                                          _use_hardlink=True)
                if progress_cb is not None:
                    progress_cb(len(batch))

    # download images from server
    if len(images_to_download) > 0:
        # prepare lists for api methods
        img_ids = []
        img_paths = []
        for img_info in images_to_download:
            img_ids.append(img_info.id)
            # TODO download to a temp file and use dataset api to add the image to the dataset.
            img_paths.append(
                os.path.join(
                    dataset.img_dir,
                    _maybe_append_image_extension(img_info.name,
                                                  img_info.ext)))

        # download annotations
        ann_info_list = api.annotation.download_batch(dataset_id, img_ids,
                                                      progress_cb)
        img_name_to_ann = {
            ann.image_id: ann.annotation
            for ann in ann_info_list
        }
        api.image.download_paths(dataset_id, img_ids, img_paths, progress_cb)
        for img_info, img_path in zip(images_to_download, img_paths):
            dataset.add_item_file(img_info.name, img_path,
                                  img_name_to_ann[img_info.id])

        if cache:
            img_hashes = [img_info.hash for img_info in images_to_download]
            cache.write_objects(img_paths, img_hashes)
Example #15
def download_pointcloud_project(api, project_id, dest_dir, dataset_ids=None, download_items=True, log_progress=False):
    LOG_BATCH_SIZE = 1

    key_id_map = KeyIdMap()

    project_fs = PointcloudProject(dest_dir, OpenMode.CREATE)

    meta = ProjectMeta.from_json(api.project.get_meta(project_id))
    project_fs.set_meta(meta)

    datasets_infos = []
    if dataset_ids is not None:
        for ds_id in dataset_ids:
            datasets_infos.append(api.dataset.get_info_by_id(ds_id))
    else:
        datasets_infos = api.dataset.get_list(project_id)

    for dataset in datasets_infos:
        dataset_fs = project_fs.create_dataset(dataset.name)
        pointclouds = api.pointcloud.get_list(dataset.id)

        ds_progress = None
        if log_progress:
            ds_progress = Progress('Downloading dataset: {!r}'.format(dataset.name), total_cnt=len(pointclouds))
        for batch in batched(pointclouds, batch_size=LOG_BATCH_SIZE):
            pointcloud_ids = [pointcloud_info.id for pointcloud_info in batch]
            pointcloud_names = [pointcloud_info.name for pointcloud_info in batch]

            ann_jsons = api.pointcloud.annotation.download_bulk(dataset.id, pointcloud_ids)

            for pointcloud_id, pointcloud_name, ann_json in zip(pointcloud_ids, pointcloud_names, ann_jsons):
                if pointcloud_name != ann_json[ApiField.NAME]:
                    raise RuntimeError("Error in api.video.annotation.download_batch: broken order")

                pointcloud_file_path = dataset_fs.generate_item_path(pointcloud_name)
                if download_items is True:
                    api.pointcloud.download_path(pointcloud_id, pointcloud_file_path)

                    related_images_path = dataset_fs.get_related_images_path(pointcloud_name)
                    related_images = api.pointcloud.get_list_related_images(pointcloud_id)
                    for rimage_info in related_images:
                        name = rimage_info[ApiField.NAME]
                        rimage_id = rimage_info[ApiField.ID]

                        path_img = os.path.join(related_images_path, name)
                        path_json = os.path.join(related_images_path, name + ".json")

                        api.pointcloud.download_related_image(rimage_id, path_img)
                        dump_json_file(rimage_info, path_json)

                else:
                    touch(pointcloud_file_path)

                dataset_fs.add_item_file(pointcloud_name,
                                         pointcloud_file_path,
                                         ann=PointcloudAnnotation.from_json(ann_json, project_fs.meta, key_id_map),
                                         _validate_item=False)

            if log_progress:
                ds_progress.iters_done_report(len(batch))

    project_fs.set_key_id_map(key_id_map)
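
A minimal usage sketch, mirroring the video-project example; the server address, token, project id and destination directory are placeholders:

import supervisely_lib as sly

api = sly.Api('https://app.supervise.ly', 'your-api-token')  # placeholders
download_pointcloud_project(api,
                            project_id=321,
                            dest_dir='/tmp/pointcloud_project',
                            dataset_ids=None,
                            download_items=True,
                            log_progress=True)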