def create_subset(self) -> Tuple[LightlySubset, List[str]]:
    base_dataset = LightlyDataset(input_dir=self.input_dir)
    filenames_base_dataset = base_dataset.get_filenames()
    no_samples_subset = int(len(filenames_base_dataset) * 0.5)
    filenames_subset = random.sample(filenames_base_dataset, no_samples_subset)
    subset = LightlySubset(base_dataset=base_dataset, filenames_subset=filenames_subset)
    return subset, filenames_subset
def create_video_subset(self, seed=0) -> Tuple[LightlySubset, List[str]]:
    random.seed(seed)
    self.create_video_dataset(n_videos=5, n_frames_per_video=10)
    base_dataset = LightlyDataset(self.input_dir)
    filenames_base_dataset = base_dataset.get_filenames()
    no_samples_subset = int(len(filenames_base_dataset) * 0.5)
    filenames_subset = random.sample(filenames_base_dataset, no_samples_subset)
    subset = LightlySubset(base_dataset=base_dataset, filenames_subset=filenames_subset)
    return subset, filenames_subset
def create_new_dataset_with_embeddings(path_to_dataset: str, token: str,
                                       dataset_name: str) -> ApiWorkflowClient:
    api_workflow_client = ApiWorkflowClient(token=token)

    # create the dataset
    api_workflow_client.create_new_dataset_with_unique_name(dataset_basename=dataset_name)

    # upload the dataset
    api_workflow_client.upload_dataset(input=path_to_dataset)

    # calculate and save the embeddings
    path_to_embeddings_csv = f"{path_to_dataset}/embeddings.csv"
    if not os.path.isfile(path_to_embeddings_csv):
        dataset = LightlyDataset(input_dir=path_to_dataset)
        embeddings = np.random.normal(size=(len(dataset.dataset.samples), 32))
        filepaths, labels = zip(*dataset.dataset.samples)
        filenames = [filepath[len(path_to_dataset):].lstrip('/')
                     for filepath in filepaths]
        print("Starting save of embeddings")
        save_embeddings(path_to_embeddings_csv, embeddings, labels, filenames)
        print("Finished save of embeddings")

    # upload the embeddings
    api_workflow_client.upload_embeddings(path_to_embeddings_csv=path_to_embeddings_csv,
                                          name="embedding_1")

    return api_workflow_client
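# Illustrative usage sketch (not part of the test suite): how the helper above
# could be driven end-to-end. The dataset path and token are placeholders, and
# the clean-up call mirrors the one in t_est_api_with_matrix below.
def example_create_dataset_with_embeddings():
    client = create_new_dataset_with_embeddings(
        path_to_dataset="path/to/dataset",  # placeholder local image folder
        token="MY_LIGHTLY_TOKEN",           # placeholder API token
        dataset_name="test_api_from_pip",
    )
    print(f"Created dataset {client.dataset_id} with random embeddings.")
    # remove the dataset from the platform again once the test run is done
    client.delete_dataset_by_id(client.dataset_id)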
def t_est_api_with_matrix(path_to_dataset: str,
                          token: str,
                          dataset_name: str = "test_api_from_pip"):
    no_samples = len(LightlyDataset(input_dir=path_to_dataset).dataset.samples)
    assert no_samples >= 100, "Test needs at least 100 samples in the dataset!"

    api_workflow_client = create_new_dataset_with_embeddings(
        path_to_dataset=path_to_dataset,
        token=token,
        dataset_name=dataset_name)

    for method in [
            SamplingMethod.CORAL, SamplingMethod.CORESET, SamplingMethod.RANDOM
    ]:
        for query_tag_name in ['initial-tag', "query_tag_name_xyz"]:
            for preselected_tag_name in [None, "preselected_tag_name_xyz"]:
                print(
                    f"Starting AL run with method '{method}', query_tag '{query_tag_name}' "
                    f"and preselected_tag '{preselected_tag_name}'.")
                t_est_active_learning(api_workflow_client, method,
                                      query_tag_name, preselected_tag_name)

    api_workflow_client.delete_dataset_by_id(api_workflow_client.dataset_id)

    print(
        "Success of the complete test suite! The dataset on the server was deleted again."
    )
def create_new_dataset_with_embeddings(path_to_dataset: str, token: str,
                                       dataset_name: str) -> ApiWorkflowClient:
    api_workflow_client = ApiWorkflowClient(token=token)

    # create the dataset
    api_workflow_client.create_new_dataset_with_unique_name(
        dataset_basename=dataset_name)

    # upload the dataset via the lightly CLI
    initialize(config_path="../../lightly/cli/config", job_name="test_app")
    cfg = compose(config_name="config", overrides=[
        f"input_dir='{path_to_dataset}'",
        f"token='{token}'",
        f"dataset_id={api_workflow_client.dataset_id}",
    ])
    upload_cli(cfg)

    # calculate and save the embeddings
    path_to_embeddings_csv = f"{path_to_dataset}/embeddings.csv"
    if not os.path.isfile(path_to_embeddings_csv):
        dataset = LightlyDataset(input_dir=path_to_dataset)
        embeddings = np.random.normal(size=(len(dataset.dataset.samples), 32))
        filepaths, labels = zip(*dataset.dataset.samples)
        filenames = [filepath[len(path_to_dataset):].lstrip('/')
                     for filepath in filepaths]
        print("Starting save of embeddings")
        save_embeddings(path_to_embeddings_csv, embeddings, labels, filenames)
        print("Finished save of embeddings")

    # upload the embeddings
    print("Starting upload of embeddings.")
    api_workflow_client.upload_embeddings(
        path_to_embeddings_csv=path_to_embeddings_csv, name="embedding_1")
    print("Finished upload of embeddings.")

    return api_workflow_client
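# Note (assumption, for orientation only): the compose() overrides above mirror
# the hydra-style arguments of the lightly CLI, so an equivalent command-line
# invocation would look roughly like the following (all values placeholders):
#
#   lightly-upload input_dir='path/to/dataset' token='MY_LIGHTLY_TOKEN' \
#       dataset_id='MY_DATASET_ID'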
def test_lightly_subset_dump(self):
    subset, filenames_subset = self.create_subset()
    dataset = subset

    out_dir = tempfile.mkdtemp()
    dataset.dump(out_dir)

    files_output_dir = LightlyDataset(input_dir=out_dir).get_filenames()
    assert set(files_output_dir) == set(dataset.get_filenames())
def __init__(self, base_dataset: LightlyDataset, filenames_subset: List[str]):
    """Creates a subset of a LightlyDataset.

    Args:
        base_dataset:
            The dataset to subset from.
        filenames_subset:
            The filenames of the samples to be part of the subset.

    """
    self.base_dataset = base_dataset
    self.filenames_subset = filenames_subset

    dict_base_dataset_filename_index: Dict[str, int] = dict()
    for index in range(len(base_dataset)):
        fname = base_dataset.index_to_filename(self.dataset, index)
        dict_base_dataset_filename_index[fname] = index

    self.mapping_subset_index_to_baseset_index = \
        [dict_base_dataset_filename_index[filename]
         for filename in filenames_subset]
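# Illustrative usage sketch for LightlySubset: expose only a chosen set of
# filenames from an existing LightlyDataset. The input directory is a
# placeholder; any folder readable by LightlyDataset would do.
def example_lightly_subset():
    base_dataset = LightlyDataset(input_dir="path/to/dataset")  # placeholder
    # keep only the first half of the files, preserving their order
    filenames_subset = base_dataset.get_filenames()[:len(base_dataset) // 2]
    subset = LightlySubset(
        base_dataset=base_dataset, filenames_subset=filenames_subset)
    # the subset only reports the selected filenames
    assert set(subset.get_filenames()) == set(filenames_subset)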
def upload_dataset(self,
                   input: Union[str, LightlyDataset],
                   max_workers: int = 8,
                   mode: str = 'thumbnails',
                   verbose: bool = True):
    """Uploads a dataset to the Lightly cloud solution.

    Args:
        input:
            One of the following:
                - the path to the dataset, e.g. "path/to/dataset"
                - the dataset in the form of a LightlyDataset
        max_workers:
            Maximum number of workers uploading images in parallel.
        mode:
            One of [full, thumbnails, metadata]. Whether to upload
            thumbnails, full images, or metadata only.

    Raises:
        ValueError: If the dataset is too large or input has the wrong type.
        RuntimeError: If the connection to the server failed.

    """
    no_tags_on_server = len(self._get_all_tags())
    if no_tags_on_server > 0:
        warnings.warn(
            f"Dataset with id {self.dataset_id} has already been completely "
            "uploaded to the platform. Skipping upload.")
        return

    # check the input variable 'input'
    if isinstance(input, str):
        dataset = LightlyDataset(input_dir=input)
    elif isinstance(input, LightlyDataset):
        dataset = input
    else:
        raise ValueError(
            f"input must either be a LightlyDataset or the path to the dataset as str, "
            f"but is of type {type(input)}")

    # check the allowed dataset size
    max_dataset_size_str = self.quota_api.get_quota_maximum_dataset_size()
    max_dataset_size = int(max_dataset_size_str)
    if len(dataset) > max_dataset_size:
        msg = f'Your dataset has {len(dataset)} samples which'
        msg += f' is more than the allowed maximum of {max_dataset_size}'
        raise ValueError(msg)

    # handle the case where len(dataset) < max_workers
    max_workers = min(len(dataset), max_workers)
    max_workers = max(max_workers, 1)

    # upload the samples
    if verbose:
        print(f'Uploading images (with {max_workers} workers).', flush=True)

    pbar = tqdm.tqdm(unit='imgs', total=len(dataset))
    tqdm_lock = tqdm.tqdm.get_lock()

    # define lambda function for concurrent upload
    def lambda_(i):
        # load image
        image, label, filename = dataset[i]
        # try to upload image
        try:
            self._upload_single_image(
                image=image,
                label=label,
                filename=filename,
                mode=mode,
            )
            success = True
        except Exception as e:
            warnings.warn(
                f"Upload of image {filename} failed with error {e}")
            success = False
        # update the progress bar
        tqdm_lock.acquire()  # lock
        pbar.update(1)       # update
        tqdm_lock.release()  # unlock
        # return whether the upload was successful
        return success

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        results = list(
            executor.map(lambda_, [i for i in range(len(dataset))],
                         chunksize=1))

    if not all(results):
        msg = 'Warning: Unsuccessful upload(s)! '
        msg += 'This could cause problems when uploading embeddings. '
        msg += 'Failed at image: {}'.format(results.index(False))
        warnings.warn(msg)

    # set image type of data and create initial tag
    if mode == 'full':
        img_type = 'full'
    elif mode == 'thumbnails':
        img_type = 'thumbnail'
    else:
        img_type = 'meta'

    initial_tag_create_request = InitialTagCreateRequest(
        img_type=img_type, creator=TagCreator.USER_PIP)
    self.tags_api.create_initial_tag_by_dataset_id(
        body=initial_tag_create_request,
        dataset_id=self.dataset_id)
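# Illustrative usage sketch for upload_dataset (this older variant): assumes a
# valid API token and an existing dataset id; all values below are placeholders.
def example_upload_dataset():
    client = ApiWorkflowClient(
        token="MY_LIGHTLY_TOKEN", dataset_id="MY_DATASET_ID")
    # upload thumbnails only; pass mode='full' for full images or
    # mode='metadata' to skip image data entirely
    client.upload_dataset(input="path/to/dataset", mode="thumbnails")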
def test_upload_dataset_from_dataset(self):
    dataset = LightlyDataset.from_torch_dataset(self.dataset)
    self.api_workflow_client.upload_dataset(input=dataset)
def upload_dataset(self,
                   input: Union[str, LightlyDataset],
                   max_workers: int = 8,
                   mode: str = 'thumbnails',
                   verbose: bool = True,
                   custom_metadata: Union[Dict, None] = None):
    """Uploads a dataset to the Lightly cloud solution.

    Args:
        input:
            Either the path to the dataset, e.g. "path/to/dataset",
            or the dataset in the form of a LightlyDataset.
        max_workers:
            Maximum number of workers uploading images in parallel.
        mode:
            One of [full, thumbnails, metadata]. Whether to upload
            thumbnails, full images, or metadata only.
        custom_metadata:
            Custom metadata to be uploaded together with the images.
            It is verified and indexed by filename before the upload.

    Raises:
        ValueError: If the dataset is too large or input has the wrong type.
        RuntimeError: If the connection to the server failed.

    """
    # get all tags of the dataset
    tags = self.get_all_tags()
    if len(tags) > 0:
        print(f'Dataset with id {self.dataset_id} has {len(tags)} tags.',
              flush=True)

    # parse "input" variable
    if isinstance(input, str):
        dataset = LightlyDataset(input_dir=input)
    elif isinstance(input, LightlyDataset):
        dataset = input
    else:
        raise ValueError(
            f'input must either be a LightlyDataset or the path to the '
            f'dataset as str, but has type {type(input)}')

    # handle the case where len(dataset) < max_workers
    max_workers = min(len(dataset), max_workers)
    max_workers = max(max_workers, 1)

    # upload the samples
    if verbose:
        print(f'Uploading images (with {max_workers} workers).', flush=True)

    # TODO: remove _size_in_bytes from image_processing
    image_processing.metadata._size_in_bytes = \
        lambda img: 0  # pylint: disable=protected-access

    # get the filenames of the samples already on the server
    samples = self._samples_api.get_samples_by_dataset_id(
        dataset_id=self.dataset_id)
    filenames_on_server = [sample.file_name for sample in samples]
    filenames_on_server_set = set(filenames_on_server)
    if len(filenames_on_server) > 0:
        print(f'Found {len(filenames_on_server)} images already on the server'
              ', they are skipped during the upload.')

    # check the maximum allowed dataset size
    total_filenames = set(
        dataset.get_filenames()).union(filenames_on_server_set)
    max_dataset_size = \
        int(self._quota_api.get_quota_maximum_dataset_size())
    if len(total_filenames) > max_dataset_size:
        msg = f'Your dataset has {len(total_filenames)} samples which'
        msg += f' is more than the allowed maximum of {max_dataset_size}'
        raise ValueError(msg)

    # index custom metadata by filename (only if it exists)
    filename_to_metadata = {}
    if custom_metadata is not None:
        self.verify_custom_metadata_format(custom_metadata)
        filename_to_metadata = self.index_custom_metadata_by_filename(
            dataset.get_filenames(),
            custom_metadata,
        )

    # get the datasource
    try:
        datasource_config: DatasourceConfigBase = self.get_datasource()
        datasource_type = datasource_config['type']
    except ApiException:
        datasource_type = 'LIGHTLY'  # default to lightly datasource

    # register dataset upload
    job_status_meta = JobStatusMeta(
        total=len(total_filenames),
        processed=len(filenames_on_server),
        is_registered=True,
        upload_method=JobStatusUploadMethod.USER_PIP,
    )
    self._datasets_api.register_dataset_upload_by_id(
        job_status_meta, self.dataset_id)

    pbar = tqdm.tqdm(
        unit='imgs',
        total=len(total_filenames) - len(filenames_on_server),
    )
    tqdm_lock = tqdm.tqdm.get_lock()

    # define lambda function for concurrent upload
    def lambda_(i):
        # load image
        image, _, filename = dataset[i]
        if filename in filenames_on_server_set:
            # the sample was already uploaded
            return True

        filepath = dataset.get_filepath_from_filename(filename, image)

        # get custom metadata (evaluates to None if there is none)
        custom_metadata_item = filename_to_metadata.get(filename, None)

        # try to upload image
        try:
            self._upload_single_image(
                image=image,
                filename=filename,
                filepath=filepath,
                mode=mode,
                custom_metadata=custom_metadata_item,
                datasource_type=datasource_type,
            )
            success = True
        except Exception as e:  # pylint: disable=broad-except
            warnings.warn(
                f'Upload of image {filename} failed with error {e}')
            success = False

        # update the progress bar
        tqdm_lock.acquire()
        pbar.update(1)
        tqdm_lock.release()
        # return whether the upload was successful
        return success

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        results = list(
            executor.map(lambda_, [i for i in range(len(dataset))],
                         chunksize=1))

    if not all(results):
        msg = 'Warning: Unsuccessful upload(s)! '
        msg += 'This could cause problems when uploading embeddings. '
        msg += 'Failed at image: {}'.format(results.index(False))
        warnings.warn(msg)

    # set image type of data and create initial tag
    if mode == 'full':
        img_type = 'full'
    elif mode == 'thumbnails':
        img_type = 'thumbnail'
    else:
        img_type = 'meta'

    if len(tags) == 0:
        # create initial tag
        initial_tag_create_request = InitialTagCreateRequest(
            img_type=img_type, creator=TagCreator.USER_PIP)
        self._tags_api.create_initial_tag_by_dataset_id(
            body=initial_tag_create_request,
            dataset_id=self.dataset_id,
        )
    else:
        # upsize existing tags
        upsize_tags_request = TagUpsizeRequest(
            upsize_tag_name=datetime.now().strftime('%Y%m%d_%Hh%Mm%Ss'),
            upsize_tag_creator=TagCreator.USER_PIP,
        )
        self._tags_api.upsize_tags_by_dataset_id(
            body=upsize_tags_request,
            dataset_id=self.dataset_id,
        )
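# Illustrative usage sketch for the custom_metadata variant above. The exact
# metadata schema is whatever verify_custom_metadata_format() accepts; the
# COCO-style "images"/"metadata" layout below is an assumption, so adapt it to
# your lightly version. All values are placeholders.
def example_upload_dataset_with_custom_metadata():
    client = ApiWorkflowClient(
        token="MY_LIGHTLY_TOKEN", dataset_id="MY_DATASET_ID")
    custom_metadata = {
        "images": [{"id": 0, "file_name": "image_0.jpg"}],
        "metadata": [{"image_id": 0, "weather": "sunny"}],
    }
    client.upload_dataset(
        input="path/to/dataset",
        mode="thumbnails",
        custom_metadata=custom_metadata,
    )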