def create_new_dataset_with_embeddings(path_to_dataset: str, token: str, dataset_name: str) -> ApiWorkflowClient:
    api_workflow_client = ApiWorkflowClient(token=token)

    # create the dataset
    api_workflow_client.create_new_dataset_with_unique_name(dataset_basename=dataset_name)

    # upload to the dataset
    api_workflow_client.upload_dataset(input=path_to_dataset)

    # calculate and save the embeddings
    path_to_embeddings_csv = f"{path_to_dataset}/embeddings.csv"
    if not os.path.isfile(path_to_embeddings_csv):
        dataset = LightlyDataset(input_dir=path_to_dataset)
        embeddings = np.random.normal(size=(len(dataset.dataset.samples), 32))
        filepaths, labels = zip(*dataset.dataset.samples)
        filenames = [filepath[len(path_to_dataset):].lstrip('/') for filepath in filepaths]
        print("Starting save of embeddings")
        save_embeddings(path_to_embeddings_csv, embeddings, labels, filenames)
        print("Finished save of embeddings")

    # upload the embeddings
    api_workflow_client.upload_embeddings(path_to_embeddings_csv=path_to_embeddings_csv, name="embedding_1")

    return api_workflow_client
def create_new_dataset_with_embeddings(path_to_dataset: str, token: str, dataset_name: str) -> ApiWorkflowClient:
    api_workflow_client = ApiWorkflowClient(token=token)

    # create the dataset
    api_workflow_client.create_new_dataset_with_unique_name(dataset_basename=dataset_name)

    # upload to the dataset
    initialize(config_path="../../lightly/cli/config", job_name="test_app")
    cfg = compose(config_name="config", overrides=[
        f"input_dir='{path_to_dataset}'",
        f"token='{token}'",
        f"dataset_id={api_workflow_client.dataset_id}"
    ])
    upload_cli(cfg)

    # calculate and save the embeddings
    path_to_embeddings_csv = f"{path_to_dataset}/embeddings.csv"
    if not os.path.isfile(path_to_embeddings_csv):
        dataset = LightlyDataset(input_dir=path_to_dataset)
        embeddings = np.random.normal(size=(len(dataset.dataset.samples), 32))
        filepaths, labels = zip(*dataset.dataset.samples)
        filenames = [filepath[len(path_to_dataset):].lstrip('/') for filepath in filepaths]
        print("Starting save of embeddings")
        save_embeddings(path_to_embeddings_csv, embeddings, labels, filenames)
        print("Finished save of embeddings")

    # upload the embeddings
    print("Starting upload of embeddings.")
    api_workflow_client.upload_embeddings(path_to_embeddings_csv=path_to_embeddings_csv, name="embedding_1")
    print("Finished upload of embeddings.")

    return api_workflow_client
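# Hypothetical usage sketch (not part of the original source): shows how the
# helper above might be called. The path, token, and dataset name are
# placeholders, and the sketch assumes the same imports the helper relies on
# (ApiWorkflowClient, LightlyDataset, save_embeddings, os, np) are available.
if __name__ == "__main__":
    client = create_new_dataset_with_embeddings(
        path_to_dataset="/path/to/images",  # folder of images to upload
        token="MY_API_TOKEN",               # placeholder Lightly API token
        dataset_name="my-dataset",          # basename for the new dataset
    )
    # dataset_id is set on the client after create_new_dataset_with_unique_name
    print(client.dataset_id)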
def _embed_cli(cfg, is_cli_call=True):
    input_dir = cfg['input_dir']
    if input_dir and is_cli_call:
        input_dir = fix_input_path(input_dir)

    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    if torch.cuda.is_available():
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')

    transform = torchvision.transforms.Compose([
        torchvision.transforms.Resize(
            (cfg['collate']['input_size'], cfg['collate']['input_size'])),
        torchvision.transforms.ToTensor(),
        torchvision.transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                         std=[0.229, 0.224, 0.225]),
    ])

    dataset = LightlyDataset(input_dir, transform=transform)

    # disable drop_last and shuffle
    cfg['loader']['drop_last'] = False
    cfg['loader']['shuffle'] = False
    cfg['loader']['batch_size'] = min(cfg['loader']['batch_size'], len(dataset))

    # determine the number of available cores
    if cfg['loader']['num_workers'] < 0:
        cfg['loader']['num_workers'] = cpu_count()

    dataloader = torch.utils.data.DataLoader(dataset, **cfg['loader'])

    encoder = get_model_from_config(cfg, is_cli_call)

    embeddings, labels, filenames = encoder.embed(dataloader, device=device)

    if is_cli_call:
        path = os.path.join(os.getcwd(), 'embeddings.csv')
        save_embeddings(path, embeddings, labels, filenames)
        print(f'Embeddings are stored at {bcolors.OKBLUE}{path}{bcolors.ENDC}')
        os.environ[
            cfg['environment_variable_names']['lightly_last_embedding_path']
        ] = path
        return path

    return embeddings, labels, filenames
def create_fake_dataset(self, n_data: int = 5, n_rows_embeddings: int = 5, n_dims_embeddings: int = 4):
    self.dataset = torchvision.datasets.FakeData(size=n_data, image_size=(3, 32, 32))

    self.folder_path = tempfile.mkdtemp()
    sample_names = [f'img_{i}.jpg' for i in range(n_data)]
    self.sample_names = sample_names
    for sample_idx in range(n_data):
        data = self.dataset[sample_idx]
        path = os.path.join(self.folder_path, sample_names[sample_idx])
        data[0].save(path)

    coco_json = {}
    coco_json['images'] = [
        {'id': i, 'file_name': fname} for i, fname in enumerate(self.sample_names)
    ]
    coco_json['metadata'] = [
        {'id': i, 'image_id': i, 'custom_metadata': 0}
        for i, _ in enumerate(self.sample_names)
    ]

    self.tfile = tempfile.NamedTemporaryFile(mode="w+")
    json.dump(coco_json, self.tfile)
    self.tfile.flush()

    # create fake embeddings
    self.path_to_embeddings = os.path.join(self.folder_path, 'embeddings.csv')
    sample_names_embeddings = [f'img_{i}.jpg' for i in range(n_rows_embeddings)]
    labels = [0] * len(sample_names_embeddings)
    save_embeddings(
        self.path_to_embeddings,
        np.random.randn(n_rows_embeddings, n_dims_embeddings),
        labels,
        sample_names_embeddings
    )
def _embed_cli(cfg, is_cli_call=True):
    data = cfg['data']
    train = cfg.get('train', True)
    checkpoint = cfg['checkpoint']
    download = cfg['download']

    root = cfg['root']
    if root and is_cli_call:
        root = fix_input_path(root)

    input_dir = cfg['input_dir']
    if input_dir and is_cli_call:
        input_dir = fix_input_path(input_dir)

    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    if torch.cuda.is_available():
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')

    transform = torchvision.transforms.Compose([
        torchvision.transforms.Resize(
            (cfg['collate']['input_size'], cfg['collate']['input_size'])),
        torchvision.transforms.ToTensor(),
        torchvision.transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                         std=[0.229, 0.224, 0.225])
    ])

    dataset = LightlyDataset(root,
                             name=data,
                             train=train,
                             download=download,
                             from_folder=input_dir,
                             transform=transform)

    cfg['loader']['drop_last'] = False
    cfg['loader']['shuffle'] = False
    cfg['loader']['batch_size'] = min(cfg['loader']['batch_size'], len(dataset))
    dataloader = torch.utils.data.DataLoader(dataset, **cfg['loader'])

    # load the PyTorch state dictionary and map it to the current device
    state_dict = None
    if not checkpoint:
        checkpoint, key = get_ptmodel_from_config(cfg['model'])
        if not checkpoint:
            msg = 'Cannot download checkpoint for key {} '.format(key)
            msg += 'because it does not exist!'
            raise RuntimeError(msg)
        state_dict = load_state_dict_from_url(
            checkpoint, map_location=device)['state_dict']
    else:
        checkpoint = fix_input_path(checkpoint) if is_cli_call else checkpoint
        state_dict = torch.load(checkpoint, map_location=device)['state_dict']

    model = ResNetSimCLR(**cfg['model']).to(device)
    if state_dict is not None:
        model.load_from_state_dict(state_dict)

    encoder = SelfSupervisedEmbedding(model, None, None, None)
    embeddings, labels, filenames = encoder.embed(dataloader, device=device)

    if is_cli_call:
        path = os.path.join(os.getcwd(), 'embeddings.csv')
        save_embeddings(path, embeddings, labels, filenames)
        print('Embeddings are stored at %s' % (path))
        return path

    return embeddings, labels, filenames
def _embed_cli(cfg, is_cli_call=True):
    checkpoint = cfg['checkpoint']

    input_dir = cfg['input_dir']
    if input_dir and is_cli_call:
        input_dir = fix_input_path(input_dir)

    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    if torch.cuda.is_available():
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')

    transform = torchvision.transforms.Compose([
        torchvision.transforms.Resize(
            (cfg['collate']['input_size'], cfg['collate']['input_size'])),
        torchvision.transforms.ToTensor(),
        torchvision.transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                         std=[0.229, 0.224, 0.225])
    ])

    dataset = LightlyDataset(input_dir, transform=transform)

    cfg['loader']['drop_last'] = False
    cfg['loader']['shuffle'] = False
    cfg['loader']['batch_size'] = min(
        cfg['loader']['batch_size'],
        len(dataset)
    )
    dataloader = torch.utils.data.DataLoader(dataset, **cfg['loader'])

    # load the PyTorch state dictionary and map it to the current device
    state_dict = None
    if not checkpoint:
        checkpoint, key = get_ptmodel_from_config(cfg['model'])
        if not checkpoint:
            msg = 'Cannot download checkpoint for key {} '.format(key)
            msg += 'because it does not exist!'
            raise RuntimeError(msg)
        state_dict = load_state_dict_from_url(
            checkpoint, map_location=device
        )['state_dict']
    else:
        checkpoint = fix_input_path(checkpoint) if is_cli_call else checkpoint
        state_dict = torch.load(
            checkpoint, map_location=device
        )['state_dict']

    # load model
    resnet = ResNetGenerator(cfg['model']['name'], cfg['model']['width'])
    last_conv_channels = list(resnet.children())[-1].in_features
    features = nn.Sequential(
        get_norm_layer(3, 0),
        *list(resnet.children())[:-1],
        nn.Conv2d(last_conv_channels, cfg['model']['num_ftrs'], 1),
        nn.AdaptiveAvgPool2d(1),
    )

    model = SimCLR(
        features,
        num_ftrs=cfg['model']['num_ftrs'],
        out_dim=cfg['model']['out_dim']
    ).to(device)

    if state_dict is not None:
        load_from_state_dict(model, state_dict)

    encoder = SelfSupervisedEmbedding(model, None, None, None)
    embeddings, labels, filenames = encoder.embed(dataloader, device=device)

    if is_cli_call:
        path = os.path.join(os.getcwd(), 'embeddings.csv')
        save_embeddings(path, embeddings, labels, filenames)
        print(f'Embeddings are stored at {bcolors.OKBLUE}{path}{bcolors.ENDC}')
        return path

    return embeddings, labels, filenames
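# Hypothetical usage sketch (not part of the original source): composes the
# Lightly CLI config with Hydra and calls _embed_cli directly, mirroring the
# override style used in create_new_dataset_with_embeddings above. The config
# path and input_dir are placeholders, and the sketch assumes the composed
# default config provides the remaining keys (model, collate, loader,
# checkpoint) that _embed_cli reads.
from hydra import compose, initialize

def embed_folder_example(path_to_dataset: str) -> str:
    initialize(config_path="../../lightly/cli/config", job_name="embed_example")
    cfg = compose(config_name="config", overrides=[
        f"input_dir='{path_to_dataset}'",
    ])
    # with is_cli_call=True the function writes embeddings.csv to the current
    # working directory and returns its path
    return _embed_cli(cfg, is_cli_call=True)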