def create_embeddings(model, dataset_name, embeddings_name, batch_size, workers, n_gpu):
    """Embed every element of a dataset and save the result as a single tensor file.

    The dataset is iterated in order (no shuffling) in batches; the model output
    of each batch is concatenated along the first dimension and written with
    `torch.save` to the "embeddings" path under the name `<embeddings_name>.pt`.

    Args:
        model: the model used for embedding the dataset. It is switched to eval
            mode and moved to the selected device.
        dataset_name: name of the registered dataset which will be embedded.
        embeddings_name: base name (without extension) of the `.pt` file where
            the embeddings will be saved.
        batch_size: size of batches for the embedding process.
        workers: number of data loader workers.
        n_gpu: number of available GPUs, forwarded to `get_device`.

    Returns:
        None. The embeddings are written to disk.
    """
    device = get_device(n_gpu)
    model = model.eval().to(device)
    dataset = get_dataset(dataset_name)
    data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=False, num_workers=workers)

    embedding_list = []
    # Inference only: disabling autograd avoids keeping a graph alive per batch.
    with torch.no_grad():
        for data in tqdm(data_loader, total=len(data_loader), desc="Embedding data"):
            # Move each batch output to the CPU right away so that device memory
            # does not grow with the size of the dataset.
            embedding_list.append(model(data[0].to(device)).cpu())

    output_path = get_path("embeddings", "{}.pt".format(embeddings_name))
    with open(output_path, "wb") as output_file:
        torch.save(torch.cat(embedding_list), output_file)
def cache_file_path(self) -> str:
    """Location of the cache file of a `MultimodalEntityDataset`.

    Returns:
        The path obtained from `get_path` for the "cache" entry and this
        object's `cache_filename`.
    """
    filename = self.cache_filename
    return get_path("cache", filename)
def image(path: str, tag: List[str]) -> None:
    """Add the image in the given path to Tensorboard.

    Args:
        path: path of the image file to be added.
        tag: path components appended after "tensorboard", "images" to build
            the directory handed to the `SummaryWriter`.

    Returns:
        None. The image is logged under its file name at global step 0.
    """
    from PIL import Image

    image_name = os.path.basename(path)
    transform = transforms.ToTensor()
    # Convert inside the context manager so the file handle is released as soon
    # as the pixel data has been turned into a tensor.
    with Image.open(path) as picture:
        image_tensor = transform(picture)

    writer = SummaryWriter(get_path("tensorboard", "images", *tag))
    try:
        writer.add_image(image_name, image_tensor, 0)
    finally:
        # Always close the writer, even if logging the image fails.
        writer.close()
    click.echo("Image added to Tensorboard: {}".format(image_name))
def load_embeddings(embeddings_name):
    """Load the embeddings tensor saved under the given name.

    Args:
        embeddings_name: base name (without extension) of the `.pt` file in the
            "embeddings" path that holds the saved embeddings.

    Returns:
        The object deserialized by `torch.load` from `<embeddings_name>.pt`,
        mapped to the location given by `get_map_location()`.
    """
    embeddings_path = get_path("embeddings", "{}.pt".format(embeddings_name))
    # Use a context manager so the file handle is closed once loading finishes.
    with open(embeddings_path, "rb") as embeddings_file:
        return torch.load(embeddings_file, map_location=get_map_location())
def plot_embedding_tsne(dataset_name, embeddings_name, load_projection=False):
    """Plot a 2D TSNE projection of the given embeddings using plotly.

    The projection and the class name of every dataset element are cached as
    `tsne.pickle` and `tsne_class_names.pickle` inside the embeddings directory
    `get_path("embeddings", embeddings_name)`, so that they can be reloaded
    later instead of being recomputed.

    Args:
        dataset_name: name of the registered dataset that was embedded.
        embeddings_name: name of the embeddings to load; also the name of the
            directory where the projection pickles are stored.
        load_projection: if True, load the projection and class names from the
            pickles instead of computing them. (Default value = False)

    Returns:
        None. The plot is rendered through plotly's `plot`.
    """
    from vscvs.embeddings import load_embeddings  # import here to avoid circular import

    dataset = get_dataset(dataset_name)
    embeddings = load_embeddings(embeddings_name).to("cpu")
    projection_pickle_dir = get_path("embeddings", embeddings_name)
    projection_path = os.path.join(projection_pickle_dir, "tsne.pickle")
    class_names_path = os.path.join(projection_pickle_dir, "tsne_class_names.pickle")

    if load_projection:
        click.echo("Loading existing 2D projection from pickle.")
        # NOTE(review): pickle can execute arbitrary code on load; these files
        # are expected to have been written by this same function.
        with open(projection_path, "rb") as projection_file:
            projection = pickle.load(projection_file)
        with open(class_names_path, "rb") as class_names_file:
            dataset_class_names = pickle.load(class_names_file)
    else:
        click.echo("Creating 2D projection of the embeddings using TSNE")
        projection = TSNE(n_components=2).fit_transform(embeddings)
        dataset_class_names = [
            dataset.classes[tup[1]]
            for tup in tqdm(dataset, desc="Retrieving image class names")
        ]
        # Make sure the target directory exists before writing the pickles.
        os.makedirs(projection_pickle_dir, exist_ok=True)
        with open(projection_path, "wb") as projection_file:
            pickle.dump(projection, projection_file)
        with open(class_names_path, "wb") as class_names_file:
            pickle.dump(dataset_class_names, class_names_file)

    trace = go.Scattergl(  # plot the resulting projection using plotly
        x=projection[:, 0],
        y=projection[:, 1],
        text=dataset_class_names,
        mode="markers",
        # Marker colors are random values, one per projected point.
        marker=dict(size=16, color=np.random.randn(len(projection)), colorscale="Viridis"),
    )
    data = [trace]
    plot(data)
def embeddings(
    dataset_name: str, embeddings_name: str, metadata: str, tag: List[str]
) -> None:
    """Add the embeddings with the given name to Tensorboard.

    Args:
        dataset_name: name of the dataset, forwarded to `sprite_tensor` to
            obtain the images shown next to each embedding.
        embeddings_name: name of the embeddings to load; also used as the
            writer sub-directory and as the first component of the tag.
        metadata: path of a tab-separated file with a "class" column, used as
            the label of each embedding.
        tag: extra components appended to `embeddings_name`, joined with "/",
            to build the embedding tag.

    Returns:
        None. The embeddings are written to the Tensorboard log directory.
    """
    from pandas import read_csv
    from vscvs.utils import sprite_tensor

    classes = read_csv(metadata, delimiter="\t")["class"]
    embeddings_tensor = load_embeddings(embeddings_name)
    # Unpack instead of concatenating so `tag` may be a list or a tuple;
    # `(embeddings_name,) + tag` raises TypeError for a list.
    full_tag = "/".join((embeddings_name, *tag))

    writer = SummaryWriter(get_path("tensorboard", "embeddings", embeddings_name))
    try:
        writer.add_embedding(
            embeddings_tensor,
            metadata=classes,
            tag=full_tag,
            label_img=sprite_tensor(dataset_name),
        )
    finally:
        # Always close the writer, even if adding the embedding fails.
        writer.close()
    click.echo("Embeddings added to Tensorboard: {}".format(embeddings_name))
def create_metadata_tsv(dataset_name):
    """Create a metadata TSV file describing every sample of the given dataset.

    The file is written to the "embeddings" path as `<dataset_name>.tsv`, with
    the columns `class_idx`, `class` and `path`, one row per entry of
    `dataset.samples`. It can be used with any embeddings of the dataset to be
    displayed in the Tensorboard embedding projector.

    Args:
        dataset_name: name of the registered DatasetFolder which will be embedded.

    Returns:
        None. The TSV file is written to disk.
    """
    dataset = get_dataset(dataset_name)
    tsv_path = get_path("embeddings", "{}.tsv".format(dataset_name))
    # newline="" lets the csv module control line endings (avoids blank rows on Windows).
    with open(tsv_path, "w", newline="") as f:
        writer = csv.DictWriter(f, ["class_idx", "class", "path"], delimiter="\t")
        writer.writeheader()
        # Stream rows with a generator instead of building the whole list first.
        writer.writerows(
            {"class_idx": idx, "class": dataset.classes[idx], "path": path}
            for path, idx in tqdm(dataset.samples, desc="Creating tsv")
        )