def _download_cli(cfg, is_cli_call=True):
    tag_name = cfg['tag_name']
    dataset_id = cfg['dataset_id']
    token = cfg['token']

    if not tag_name:
        print('Please specify a tag name')
        print('For help, try: lightly-download --help')
        return

    if not token or not dataset_id:
        print('Please specify your access token and dataset id')
        print('For help, try: lightly-download --help')
        return

    api_workflow_client = ApiWorkflowClient(token=token, dataset_id=dataset_id)

    # get tag id
    tag_name_id_dict = dict([tag.name, tag.id] for tag in api_workflow_client._get_all_tags())
    tag_id = tag_name_id_dict.get(tag_name, None)
    if tag_id is None:
        print(f'The specified tag {tag_name} does not exist.')
        return

    # get tag data
    tag_data = api_workflow_client.tags_api.get_tag_by_tag_id(
        dataset_id=dataset_id, tag_id=tag_id)

    # get samples
    chosen_samples_ids = BitMask.from_hex(tag_data.bit_mask_data).to_indices()
    samples = [
        api_workflow_client.filenames_on_server[i] for i in chosen_samples_ids
    ]

    # store sample names in a .txt file
    with open(cfg['tag_name'] + '.txt', 'w') as f:
        for item in samples:
            f.write("%s\n" % item)

    msg = 'The list of files in tag {} is stored at: '.format(cfg['tag_name'])
    msg += os.path.join(os.getcwd(), cfg['tag_name'] + '.txt')
    print(msg, flush=True)

    if not cfg['input_dir'] and cfg['output_dir']:
        # download full images from api
        output_dir = fix_input_path(cfg['output_dir'])
        api_workflow_client.download_dataset(output_dir, tag_name=tag_name)
    elif cfg['input_dir'] and cfg['output_dir']:
        input_dir = fix_input_path(cfg['input_dir'])
        output_dir = fix_input_path(cfg['output_dir'])
        print(f'Copying files from {input_dir} to {output_dir}.')

        # create a dataset from the input directory
        dataset = data.LightlyDataset(input_dir=input_dir)

        # dump the dataset in the output directory
        dataset.dump(output_dir, samples)
def t_est_active_learning(api_workflow_client: ApiWorkflowClient,
                          method: SamplingMethod = SamplingMethod.CORAL,
                          query_tag_name: str = 'initial-tag',
                          preselected_tag_name: str = None,
                          n_samples_additional: List[int] = [2, 5]):
    # create the query tag with 100 samples and the preselected tag with
    # 10 samples if they do not exist yet
    if query_tag_name is not None:
        sampler_config = SamplerConfig(method=SamplingMethod.RANDOM, n_samples=100, name=query_tag_name)
        try:
            api_workflow_client.sampling(sampler_config=sampler_config)
        except RuntimeError:
            pass
    if preselected_tag_name is not None:
        sampler_config = SamplerConfig(method=SamplingMethod.RANDOM, n_samples=10, name=preselected_tag_name)
        try:
            api_workflow_client.sampling(sampler_config=sampler_config)
        except RuntimeError:
            pass

    # define the active learning agent
    agent = ActiveLearningAgent(api_workflow_client,
                                query_tag_name=query_tag_name,
                                preselected_tag_name=preselected_tag_name)

    total_no_samples = len(agent.unlabeled_set) + len(agent.labeled_set)

    al_scorer = None
    for iteration, n_additional in enumerate(n_samples_additional):
        n_samples = len(agent.labeled_set) + n_additional
        print(f"Beginning with iteration {iteration} to have {n_samples} labeled samples.")

        # Perform a sampling
        method_here = SamplingMethod.CORESET if iteration == 0 and method == SamplingMethod.CORAL else method
        sampler_config = SamplerConfig(method=method_here, n_samples=n_samples)
        if al_scorer is None:
            agent.query(sampler_config=sampler_config)
        else:
            agent.query(sampler_config=sampler_config, al_scorer=al_scorer)

        assert len(agent.labeled_set) == n_samples
        assert len(agent.unlabeled_set) == total_no_samples - n_samples

        # Update the scorer with random, normalized mock predictions
        n_samples = len(agent.query_set)
        n_classes = 10
        predictions = np.random.rand(n_samples, n_classes)
        predictions_normalized = predictions / np.sum(predictions, axis=1)[:, np.newaxis]
        al_scorer = ScorerClassification(model_output=predictions_normalized)

    print("Success!")
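# A minimal usage sketch of the loop above (an illustration, not part of the
# library): the token, dataset_id and tag name are placeholders, and the dataset
# must already have an embedding uploaded for the CORESET/CORAL samplers to run.
client = ApiWorkflowClient(token="YOUR_TOKEN", dataset_id="YOUR_DATASET_ID")
t_est_active_learning(
    client,
    method=SamplingMethod.CORAL,
    query_tag_name="initial-tag",
    preselected_tag_name=None,
    n_samples_additional=[2, 5],
)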
def _upload_cli(cfg, is_cli_call=True):
    input_dir = cfg['input_dir']
    if input_dir and is_cli_call:
        input_dir = fix_input_path(input_dir)

    path_to_embeddings = cfg['embeddings']
    if path_to_embeddings and is_cli_call:
        path_to_embeddings = fix_input_path(path_to_embeddings)

    dataset_id = cfg['dataset_id']
    token = cfg['token']
    new_dataset_name = cfg['new_dataset_name']

    cli_api_args_wrong = False
    if not token:
        print_as_warning('Please specify your access token.')
        cli_api_args_wrong = True

    dataset_id_ok = dataset_id and len(dataset_id) > 0
    new_dataset_name_ok = new_dataset_name and len(new_dataset_name) > 0
    if new_dataset_name_ok and not dataset_id_ok:
        api_workflow_client = ApiWorkflowClient(token=token)
        api_workflow_client.create_dataset(dataset_name=new_dataset_name)
    elif dataset_id_ok and not new_dataset_name_ok:
        api_workflow_client = ApiWorkflowClient(token=token, dataset_id=dataset_id)
    else:
        print_as_warning('Please specify either the dataset_id of an existing dataset or a new_dataset_name.')
        cli_api_args_wrong = True

    if cli_api_args_wrong:
        print_as_warning('For help, try: lightly-upload --help')
        return

    size = cfg['resize']
    if not isinstance(size, int):
        size = tuple(size)
    transform = None
    if isinstance(size, tuple) or size > 0:
        transform = torchvision.transforms.Resize(size)

    if input_dir:
        mode = cfg['upload']
        dataset = LightlyDataset(input_dir=input_dir, transform=transform)
        api_workflow_client.upload_dataset(
            input=dataset, mode=mode, max_workers=cfg['loader']['num_workers']
        )
        print("Finished the upload of the dataset.")

    if path_to_embeddings:
        name = cfg['embedding_name']
        print("Starting upload of embeddings.")
        api_workflow_client.upload_embeddings(
            path_to_embeddings_csv=path_to_embeddings, name=name
        )
        print("Finished upload of embeddings.")

    if new_dataset_name_ok:
        print(f'The dataset_id of the newly created dataset is '
              f'{bcolors.OKBLUE}{api_workflow_client.dataset_id}{bcolors.ENDC}')
def create_new_dataset_with_embeddings(path_to_dataset: str,
                                       token: str,
                                       dataset_name: str) -> ApiWorkflowClient:
    api_workflow_client = ApiWorkflowClient(token=token)

    # create the dataset
    api_workflow_client.create_new_dataset_with_unique_name(dataset_basename=dataset_name)

    # upload to the dataset
    api_workflow_client.upload_dataset(input=path_to_dataset)

    # calculate and save the embeddings
    path_to_embeddings_csv = f"{path_to_dataset}/embeddings.csv"
    if not os.path.isfile(path_to_embeddings_csv):
        dataset = LightlyDataset(input_dir=path_to_dataset)
        embeddings = np.random.normal(size=(len(dataset.dataset.samples), 32))
        filepaths, labels = zip(*dataset.dataset.samples)
        filenames = [filepath[len(path_to_dataset):].lstrip('/') for filepath in filepaths]
        print("Starting save of embeddings")
        save_embeddings(path_to_embeddings_csv, embeddings, labels, filenames)
        print("Finished save of embeddings")

    # upload the embeddings
    api_workflow_client.upload_embeddings(path_to_embeddings_csv=path_to_embeddings_csv, name="embedding_1")

    return api_workflow_client
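# Hedged usage sketch for the helper above: the dataset path and token are
# placeholders. Running it creates a dataset on the Lightly Platform, uploads
# the images, and uploads a set of (here random) embeddings.
client = create_new_dataset_with_embeddings(
    path_to_dataset="/path/to/your/dataset",
    token="YOUR_TOKEN",
    dataset_name="test_dataset",
)
print(f"Created dataset with id {client.dataset_id}")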
def _upload_cli(cfg, is_cli_call=True):
    input_dir = cfg['input_dir']
    if input_dir and is_cli_call:
        input_dir = fix_input_path(input_dir)

    path_to_embeddings = cfg['embeddings']
    if path_to_embeddings and is_cli_call:
        path_to_embeddings = fix_input_path(path_to_embeddings)

    dataset_id = cfg['dataset_id']
    token = cfg['token']
    new_dataset_name = cfg['new_dataset_name']

    if not token:
        warnings.warn('Please specify your access token. For help, try: lightly-upload --help')
        return

    dataset_id_ok = dataset_id and len(dataset_id) > 0
    new_dataset_name_ok = new_dataset_name and len(new_dataset_name) > 0
    if new_dataset_name_ok and not dataset_id_ok:
        api_workflow_client = ApiWorkflowClient(token=token)
        api_workflow_client.create_dataset(dataset_name=new_dataset_name)
    elif dataset_id_ok and not new_dataset_name_ok:
        api_workflow_client = ApiWorkflowClient(token=token, dataset_id=dataset_id)
    else:
        warnings.warn('Please specify either the dataset_id of an existing dataset or a new_dataset_name. '
                      'For help, try: lightly-upload --help')
        return

    size = cfg['resize']
    if not isinstance(size, int):
        size = tuple(size)
    transform = None
    if isinstance(size, tuple) or size > 0:
        transform = torchvision.transforms.Resize(size)

    if input_dir:
        mode = cfg['upload']
        dataset = LightlyDataset(input_dir=input_dir, transform=transform)
        api_workflow_client.upload_dataset(
            input=dataset, mode=mode, max_workers=cfg['loader']['num_workers']
        )

    if path_to_embeddings:
        name = cfg['embedding_name']
        api_workflow_client.upload_embeddings(
            path_to_embeddings_csv=path_to_embeddings, name=name
        )
def __init__(self, *args, **kwargs):
    lightly.api.api_workflow_client.ApiClient = MockedApiClient
    lightly.api.version_checking.VersioningApi = MockedVersioningApi
    ApiWorkflowClient.__init__(self, *args, **kwargs)

    self.samplings_api = MockedSamplingsApi(api_client=self.api_client)
    self.jobs_api = MockedJobsApi(api_client=self.api_client)
    self.tags_api = MockedTagsApi(api_client=self.api_client)
    self.embeddings_api = MockedEmbeddingsApi(api_client=self.api_client)
    self.mappings_api = MockedMappingsApi(api_client=self.api_client)
    self.scores_api = MockedScoresApi(api_client=self.api_client)
    self.samples_api = MockedSamplesApi(api_client=self.api_client)
    self.datasets_api = MockedDatasetsApi(api_client=self.api_client)
    self.quota_api = MockedQuotaApi(api_client=self.api_client)

    lightly.api.api_workflow_client.put_request = mocked_put_request

    self.wait_time_till_next_poll = 0.001  # for api_workflow_sampling
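# Hedged sketch of how this mocked client could be exercised in a unit test;
# MockedApiWorkflowClient is assumed to be the class owning the __init__ above,
# and the token/dataset_id values are arbitrary test strings.
def test_mocked_client_stays_in_process():
    client = MockedApiWorkflowClient(token="token_xyz", dataset_id="dataset_0_id")
    # every sub-API is replaced by a mock, so no network calls are made
    assert isinstance(client.tags_api, MockedTagsApi)
    assert client.wait_time_till_next_poll == 0.001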
def create_new_dataset_with_embeddings(path_to_dataset: str,
                                       token: str,
                                       dataset_name: str) -> ApiWorkflowClient:
    api_workflow_client = ApiWorkflowClient(token=token)

    # create the dataset
    api_workflow_client.create_new_dataset_with_unique_name(dataset_basename=dataset_name)

    # upload to the dataset
    initialize(config_path="../../lightly/cli/config", job_name="test_app")
    cfg = compose(config_name="config", overrides=[
        f"input_dir='{path_to_dataset}'",
        f"token='{token}'",
        f"dataset_id={api_workflow_client.dataset_id}"
    ])
    upload_cli(cfg)

    # calculate and save the embeddings
    path_to_embeddings_csv = f"{path_to_dataset}/embeddings.csv"
    if not os.path.isfile(path_to_embeddings_csv):
        dataset = LightlyDataset(input_dir=path_to_dataset)
        embeddings = np.random.normal(size=(len(dataset.dataset.samples), 32))
        filepaths, labels = zip(*dataset.dataset.samples)
        filenames = [filepath[len(path_to_dataset):].lstrip('/') for filepath in filepaths]
        print("Starting save of embeddings")
        save_embeddings(path_to_embeddings_csv, embeddings, labels, filenames)
        print("Finished save of embeddings")

    # upload the embeddings
    print("Starting upload of embeddings.")
    api_workflow_client.upload_embeddings(path_to_embeddings_csv=path_to_embeddings_csv, name="embedding_1")
    print("Finished upload of embeddings.")

    return api_workflow_client
def _download_cli(cfg, is_cli_call=True):
    tag_name = str(cfg['tag_name'])
    dataset_id = str(cfg['dataset_id'])
    token = str(cfg['token'])

    if not tag_name or not token or not dataset_id:
        print_as_warning('Please specify all of the parameters tag_name, token and dataset_id')
        print_as_warning('For help, try: lightly-download --help')
        return

    api_workflow_client = ApiWorkflowClient(
        token=token, dataset_id=dataset_id
    )

    # get the tag data and the filenames in the tag
    tag_data = api_workflow_client.get_tag_by_name(tag_name)
    filenames_tag = api_workflow_client.get_filenames_in_tag(
        tag_data,
        exclude_parent_tag=cfg['exclude_parent_tag'],
    )

    # store sample names in a .txt file
    filename = tag_name + '.txt'
    with open(filename, 'w') as f:
        for item in filenames_tag:
            f.write("%s\n" % item)

    filepath = os.path.join(os.getcwd(), filename)
    msg = f'The list of files in tag {cfg["tag_name"]} is stored at: {bcolors.OKBLUE}{filepath}{bcolors.ENDC}'
    print(msg, flush=True)

    if not cfg['input_dir'] and cfg['output_dir']:
        # download full images from api
        output_dir = fix_input_path(cfg['output_dir'])
        api_workflow_client.download_dataset(output_dir, tag_name=tag_name)
    elif cfg['input_dir'] and cfg['output_dir']:
        input_dir = fix_input_path(cfg['input_dir'])
        output_dir = fix_input_path(cfg['output_dir'])
        print(f'Copying files from {input_dir} to {bcolors.OKBLUE}{output_dir}{bcolors.ENDC}.')

        # create a dataset from the input directory
        dataset = data.LightlyDataset(input_dir=input_dir)

        # dump the dataset in the output directory
        dataset.dump(output_dir, filenames_tag)
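# Hedged invocation sketch: the config normally comes from the Hydra-based
# `lightly-download` entrypoint; the plain dict below only illustrates the keys
# that _download_cli reads, and every value is a placeholder.
cfg = {
    'tag_name': 'initial-tag',
    'dataset_id': 'YOUR_DATASET_ID',
    'token': 'YOUR_TOKEN',
    'exclude_parent_tag': False,
    'input_dir': '',
    'output_dir': '',
}
_download_cli(cfg)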
def _upload_cli(cfg, is_cli_call=True):
    input_dir = cfg['input_dir']
    if input_dir and is_cli_call:
        input_dir = fix_input_path(input_dir)

    path_to_embeddings = cfg['embeddings']
    if path_to_embeddings and is_cli_call:
        path_to_embeddings = fix_input_path(path_to_embeddings)

    dataset_id = cfg['dataset_id']
    token = cfg['token']

    size = cfg['resize']
    if not isinstance(size, int):
        size = tuple(size)
    transform = None
    if isinstance(size, tuple) or size > 0:
        transform = torchvision.transforms.Resize(size)

    if not token or not dataset_id:
        print('Please specify your access token and dataset id.')
        print('For help, try: lightly-upload --help')
        return

    api_workflow_client = ApiWorkflowClient(token=token, dataset_id=dataset_id)

    if input_dir:
        mode = cfg['upload']
        dataset = LightlyDataset(input_dir=input_dir, transform=transform)
        api_workflow_client.upload_dataset(input=dataset, mode=mode)

    if path_to_embeddings:
        name = cfg['embedding_name']
        api_workflow_client.upload_embeddings(
            path_to_embeddings_csv=path_to_embeddings, name=name)
def _upload_cli(cfg, is_cli_call=True):
    input_dir = cfg['input_dir']
    if input_dir and is_cli_call:
        input_dir = fix_input_path(input_dir)

    path_to_embeddings = cfg['embeddings']
    if path_to_embeddings and is_cli_call:
        path_to_embeddings = fix_input_path(path_to_embeddings)

    dataset_id = cfg['dataset_id']
    token = cfg['token']
    new_dataset_name = cfg['new_dataset_name']

    cli_api_args_wrong = False
    if not token:
        print_as_warning('Please specify your access token.')
        cli_api_args_wrong = True

    if dataset_id:
        if new_dataset_name:
            print_as_warning(
                'Please specify either the dataset_id of an existing dataset '
                'or a new_dataset_name, but not both.')
            cli_api_args_wrong = True
        else:
            api_workflow_client = \
                ApiWorkflowClient(token=token, dataset_id=dataset_id)
    else:
        if new_dataset_name:
            api_workflow_client = ApiWorkflowClient(token=token)
            api_workflow_client.create_dataset(dataset_name=new_dataset_name)
        else:
            print_as_warning(
                'Please specify either the dataset_id of an existing dataset '
                'or a new_dataset_name.')
            cli_api_args_wrong = True

    # delete the dataset_id as it might be an empty string
    # Use api_workflow_client.dataset_id instead
    del dataset_id

    if cli_api_args_wrong:
        print_as_warning('For help, try: lightly-upload --help')
        return

    # potentially load custom metadata
    custom_metadata = None
    if cfg['custom_metadata']:
        path_to_custom_metadata = fix_input_path(cfg['custom_metadata'])
        print('Loading custom metadata from '
              f'{bcolors.OKBLUE}{path_to_custom_metadata}{bcolors.ENDC}')
        with open(path_to_custom_metadata, 'r') as f:
            custom_metadata = json.load(f)

    # set the number of workers if unset
    if cfg['loader']['num_workers'] < 0:
        # use the number of CPUs available, but at least 8 and at most 32 workers
        num_workers = max(8, cpu_count())
        num_workers = min(32, num_workers)
        cfg['loader']['num_workers'] = num_workers

    size = cfg['resize']
    if not isinstance(size, int):
        size = tuple(size)
    transform = None
    if isinstance(size, tuple) or size > 0:
        transform = torchvision.transforms.Resize(size)

    if input_dir:
        mode = cfg['upload']
        dataset = LightlyDataset(input_dir=input_dir, transform=transform)
        api_workflow_client.upload_dataset(
            input=dataset,
            mode=mode,
            max_workers=cfg['loader']['num_workers'],
            custom_metadata=custom_metadata,
        )
        print('Finished the upload of the dataset.')

    if path_to_embeddings:
        name = cfg['embedding_name']
        print('Starting upload of embeddings.')
        api_workflow_client.upload_embeddings(
            path_to_embeddings_csv=path_to_embeddings, name=name)
        print('Finished upload of embeddings.')

    if custom_metadata is not None and not input_dir:
        # upload custom metadata separately
        api_workflow_client.upload_custom_metadata(
            custom_metadata,
            verbose=True,
            max_workers=cfg['loader']['num_workers'],
        )

    if new_dataset_name:
        print(
            f'The dataset_id of the newly created dataset is '
            f'{bcolors.OKBLUE}{api_workflow_client.dataset_id}{bcolors.ENDC}')

    os.environ[
        cfg['environment_variable_names']['lightly_last_dataset_id']
    ] = api_workflow_client.dataset_id
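# Hedged invocation sketch for the upload variant above: as with the download
# command, the config normally comes from the Hydra-based `lightly-upload`
# entrypoint; the dict below only illustrates the keys this function reads and
# every value (paths, token, dataset name, mode, env var name) is a placeholder.
cfg = {
    'input_dir': '/path/to/your/dataset',
    'embeddings': '',
    'dataset_id': '',
    'token': 'YOUR_TOKEN',
    'new_dataset_name': 'my_new_dataset',
    'custom_metadata': '',
    'resize': -1,
    'upload': 'full',
    'embedding_name': 'default',
    'loader': {'num_workers': -1},
    'environment_variable_names': {
        'lightly_last_dataset_id': 'LIGHTLY_LAST_DATASET_ID',
    },
}
_upload_cli(cfg)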
def _download_cli(cfg, is_cli_call=True):
    tag_name = cfg['tag_name']
    dataset_id = cfg['dataset_id']
    token = cfg['token']

    if not tag_name or not token or not dataset_id:
        print_as_warning(
            'Please specify all of the parameters tag_name, token and dataset_id'
        )
        print_as_warning('For help, try: lightly-download --help')
        return

    api_workflow_client = ApiWorkflowClient(token=token, dataset_id=dataset_id)

    # get tag id
    tag_name_id_dict = dict([tag.name, tag.id] for tag in api_workflow_client._get_all_tags())
    tag_id = tag_name_id_dict.get(tag_name, None)
    if tag_id is None:
        warnings.warn(f'The specified tag {tag_name} does not exist.')
        return

    # get tag data
    tag_data: TagData = api_workflow_client.tags_api.get_tag_by_tag_id(
        dataset_id=dataset_id, tag_id=tag_id)

    if cfg["exclude_parent_tag"]:
        parent_tag_id = tag_data.prev_tag_id
        tag_arithmetics_request = TagArithmeticsRequest(
            tag_id1=tag_data.id,
            tag_id2=parent_tag_id,
            operation=TagArithmeticsOperation.DIFFERENCE)
        bit_mask_response: TagBitMaskResponse \
            = api_workflow_client.tags_api.perform_tag_arithmetics(body=tag_arithmetics_request, dataset_id=dataset_id)
        bit_mask_data = bit_mask_response.bit_mask_data
    else:
        bit_mask_data = tag_data.bit_mask_data

    # get samples
    chosen_samples_ids = BitMask.from_hex(bit_mask_data).to_indices()
    samples = [
        api_workflow_client.filenames_on_server[i] for i in chosen_samples_ids
    ]

    # store sample names in a .txt file
    filename = cfg['tag_name'] + '.txt'
    with open(filename, 'w') as f:
        for item in samples:
            f.write("%s\n" % item)

    filepath = os.path.join(os.getcwd(), filename)
    msg = f'The list of files in tag {cfg["tag_name"]} is stored at: {bcolors.OKBLUE}{filepath}{bcolors.ENDC}'
    print(msg, flush=True)

    if not cfg['input_dir'] and cfg['output_dir']:
        # download full images from api
        output_dir = fix_input_path(cfg['output_dir'])
        api_workflow_client.download_dataset(output_dir, tag_name=tag_name)
    elif cfg['input_dir'] and cfg['output_dir']:
        input_dir = fix_input_path(cfg['input_dir'])
        output_dir = fix_input_path(cfg['output_dir'])
        print(
            f'Copying files from {input_dir} to {bcolors.OKBLUE}{output_dir}{bcolors.ENDC}.'
        )

        # create a dataset from the input directory
        dataset = data.LightlyDataset(input_dir=input_dir)

        # dump the dataset in the output directory
        dataset.dump(output_dir, samples)
        return features_array

    def get_labels(self, filenames: List[str]) -> np.ndarray:
        labels = np.array(
            [self.dataset[filename][1] for filename in filenames])
        return labels


# %%
# First we read the variables that we previously set as environment variables in the console
token = os.getenv("LIGHTLY_TOKEN", default="YOUR_TOKEN")
path_to_embeddings_csv = os.getenv("LIGHTLY_EMBEDDINGS_CSV", default="path_to_your_embeddings_csv")

# We define the client to the Lightly Platform API
api_workflow_client = ApiWorkflowClient(token=token)
api_workflow_client.create_dataset(dataset_name="active_learning_clothing_dataset")

# %%
# We define the dataset, the classifier and the active learning agent
dataset = CSVEmbeddingDataset(path_to_embeddings_csv=path_to_embeddings_csv)
classifier = LogisticRegression(max_iter=1000)
agent = ActiveLearningAgent(api_workflow_client=api_workflow_client)

# %%
# 1. Choose an initial subset of your dataset.
# We want to start with 200 samples and use the CORESET sampler for sampling them.
print("Starting the initial sampling")
sampler_config = SamplerConfig(n_samples=200, method=SamplingMethod.CORESET,
             for filename, embedding_row, label in zip(filenames, embeddings, labels)])

    def get_features(self, filenames: List[str]) -> np.ndarray:
        features_array = np.array(
            [self.dataset[filename][0] for filename in filenames])
        return features_array

    def get_labels(self, filenames: List[str]) -> np.ndarray:
        labels = np.array(
            [self.dataset[filename][1] for filename in filenames])
        return labels


# %%
# Upload the embeddings to the Lightly web platform
api_workflow_client = ApiWorkflowClient(token=YOUR_TOKEN, dataset_id=YOUR_DATASET_ID)
api_workflow_client.upload_embeddings(name="embedding-1", path_to_embeddings_csv=path_to_embeddings_csv)

# %%
# Define the dataset for the classifier, the classifier and the active learning agent
dataset = CSVEmbeddingDataset(path_to_embeddings_csv=path_to_embeddings_csv)
classifier = KNeighborsClassifier(n_neighbors=20, weights='distance')
agent = ActiveLearningAgent(api_workflow_client=api_workflow_client)

# %%
# 1. Choose an initial subset of your dataset.
# We want to start with 100 samples and use the CORESET sampler for sampling them.
print("Starting the initial sampling")
sampler_config = SamplerConfig(n_samples=100, method=SamplingMethod.CORESET,
# ----------------------------
#
# In active learning, we want to pick the new data for which our model struggles
# the most. If we have an image with a single car in it and our model is
# highly confident that there is a car, we don't gain much by including
# this example in our training data. However, if we focus on images where the
# model is not sure whether the object is a car or a building, we want
# to include these images to refine the decision boundary.
#
# First, we need to create an active learning agent in order to
# provide Lightly with the model predictions.
# We can use the ApiWorkflowClient for this. Make sure that we use the
# right dataset_id and token.

# create Lightly API client
api_client = ApiWorkflowClient(dataset_id=YOUR_DATASET_ID, token=YOUR_TOKEN)
al_agent = ActiveLearningAgent(api_client)

# %%
# We can access the images of the dataset we want to use for active learning using
# the `al_agent.query_set` property.

# let's print the first 3 entries
print(al_agent.query_set[:3])

# %%
# Note that our active learning agent has already synchronized with the Lightly
# Platform and knows the filenames present in our dataset.
#
# Let's verify the length of the `query_set`. The `query_set` is the set of