Example #1
def create_embeddings(model, dataset_name, embeddings_name, batch_size,
                      workers, n_gpu):
    """Creates embedding vectors for each element in the given DataSet, and saves a single Pytorch tensor of shape
    `len(dataset), embeddings_size` with all the embeddings in a file with the given name.

    Args:
      model: name of the model to be used for embedding the DataSet.
      dataset_name: name of the registered dataset which will be embedded.
      embeddings_name: the name of the pickle directory where the embeddings will be saved.
      batch_size: size of batches for the embedding process.
      workers: number of data loader workers.
      n_gpu: number of available GPUs. If zero, the CPU will be used.

    Returns:

    """
    device = get_device(n_gpu)
    model = model.eval().to(device)
    dataset = get_dataset(dataset_name)
    data_loader = DataLoader(dataset,
                             batch_size=batch_size,
                             shuffle=False,
                             num_workers=workers)
    with torch.no_grad():  # inference only: gradients are not needed and would exhaust GPU memory
        embedding_list = [
            model(data[0].to(device)).to("cpu")  # each batch output is sent to the CPU to prevent GPU memory overflow
            for data in tqdm(data_loader, total=len(data_loader), desc="Embedding data")
        ]
    torch.save(torch.cat(embedding_list),
               get_path("embeddings", "{}.pt".format(embeddings_name)))
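
A minimal usage sketch for create_embeddings follows, assuming a torchvision backbone and a dataset name that is registered with the project's get_dataset helper; the model, dataset name and embeddings name below are placeholders, not part of the original code.

import torch
import torchvision.models as models

# Hypothetical example: embed a registered dataset (placeholder name "sketchy-photos")
# with a ResNet-50 backbone whose classification head is replaced by an identity layer,
# so the model outputs raw feature vectors.
encoder = models.resnet50(pretrained=True)
encoder.fc = torch.nn.Identity()
create_embeddings(encoder,
                  dataset_name="sketchy-photos",
                  embeddings_name="sketchy-photos-resnet50",
                  batch_size=64,
                  workers=4,
                  n_gpu=1)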
Example #2
    def cache_file_path(self) -> str:
        """File path of a `MultimodalEntityDataset` cache file.

        Returns:
            The file path of the cache file.
        """
        return get_path("cache", self.cache_filename)
Example #3
def image(path: str, tag: List[str]) -> None:
    """Add the image in the given path to Tensorboard."""
    from PIL import Image

    image_name = os.path.basename(path)
    transform = transforms.ToTensor()
    writer = SummaryWriter(get_path("tensorboard", "images", *tag))
    writer.add_image(image_name, transform(Image.open(path)), 0)
    writer.close()
    click.echo("Image added to Tensorboard: {}".format(image_name))
Example #4
def load_embeddings(embeddings_name):
    """Loads an embedding directory composed of pickled Tensors with image embeddings for a batch.

    Args:
      embeddings_name: the name of the pickle directory where the embeddings are saved.

    Returns:
      a Pytorch tensor with all the embeddings found in the provided embedding directory. The later must
      contain pickled tensor objects with image embeddings.

    """
    return torch.load(get_path("embeddings", "{}.pt".format(embeddings_name)),
                      map_location=get_map_location())
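
A short usage sketch for load_embeddings; the embeddings name is a placeholder for a file previously created with create_embeddings.

# Hypothetical example: load previously saved embeddings and inspect their shape.
embeddings = load_embeddings("sketchy-photos-resnet50")  # placeholder name
print(embeddings.shape)  # expected: torch.Size([len(dataset), embedding_size])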
Example #5
def plot_embedding_tsne(dataset_name, embeddings_name, load_projection=False):
    """Plot a 2D projection of embeddings in the specified embedding directory using plotly.

    Args:
      dataset_name: name of the registered dataset which will be embedded.
      embeddings_name: the name of the directory where the batch pickles will be saved.
      load_projection: load projections from pickles. (Default value = False)

    Returns:

    """
    from vscvs.embeddings import load_embeddings  # import here to avoid circular import

    dataset = get_dataset(dataset_name)
    embeddings = load_embeddings(embeddings_name).to("cpu")
    projection_pickle_dir = get_path("embeddings", embeddings_name)
    if load_projection:
        click.echo("Loading existing 2D projection from pickle.")
        with open(os.path.join(projection_pickle_dir, "tsne.pickle"), "rb") as pickle_file:
            projection = pickle.load(pickle_file)
        with open(os.path.join(projection_pickle_dir, "tsne_class_names.pickle"), "rb") as pickle_file:
            dataset_class_names = pickle.load(pickle_file)
    else:
        click.echo("Creating 2D projection of the embeddings using TSNE")
        projection = TSNE(n_components=2).fit_transform(embeddings)
        dataset_class_names = [
            dataset.classes[tup[1]]
            for tup in tqdm(dataset, desc="Retrieving image class names")
        ]
        with open(os.path.join(projection_pickle_dir, "tsne.pickle"), "wb") as pickle_file:
            pickle.dump(projection, pickle_file)
        with open(os.path.join(projection_pickle_dir, "tsne_class_names.pickle"), "wb") as pickle_file:
            pickle.dump(dataset_class_names, pickle_file)
    trace = go.Scattergl(  # plot the resulting projection using plotly
        x=projection[:, 0],
        y=projection[:, 1],
        text=dataset_class_names,
        mode="markers",
        marker=dict(size=16,
                    color=np.random.randn(len(projection)),
                    colorscale="Viridis"),
    )
    data = [trace]
    plot(data)
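
A usage sketch for plot_embedding_tsne, assuming the dataset and embeddings names used in the earlier sketches; both names are placeholders.

# Hypothetical example: compute and plot the t-SNE projection, then reuse the cached pickles.
plot_embedding_tsne("sketchy-photos", "sketchy-photos-resnet50")
plot_embedding_tsne("sketchy-photos", "sketchy-photos-resnet50", load_projection=True)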
Example #6
def embeddings(
    dataset_name: str, embeddings_name: str, metadata: str, tag: List[str]
) -> None:
    """Add the embeddings in the given path to Tensorboard."""
    from pandas import read_csv
    from vscvs.utils import sprite_tensor

    classes = read_csv(metadata, delimiter="\t")["class"]
    embeddings_tensor = load_embeddings(embeddings_name)
    writer = SummaryWriter(get_path("tensorboard", "embeddings", embeddings_name))
    writer.add_embedding(
        embeddings_tensor,
        metadata=classes,
        tag="/".join((embeddings_name,) + tag),
        label_img=sprite_tensor(dataset_name),
    )
    writer.close()
    click.echo("Embeddings added to Tensorboard: {}".format(embeddings_name))
Example #7
def create_metadata_tsv(dataset_name):
    """Creates a metadata TSV file that can be used with any embeddings of the given dataset to be displayed in the
    Tensorboard embedding projector.

    Args:
      dataset_name: name of the registered DatasetFolder which will be embedded.

    Returns:

    """
    dataset = get_dataset(dataset_name)
    with open(get_path("embeddings", "{}.tsv".format(dataset_name)), "w", newline="") as f:
        writer = csv.DictWriter(f, ["class_idx", "class", "path"],
                                delimiter="\t")
        writer.writeheader()
        writer.writerows([{
            "class_idx": idx,
            "class": dataset.classes[idx],
            "path": path
        } for path, idx in tqdm(dataset.samples, desc="Creating tsv")])
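
A usage sketch for create_metadata_tsv; the dataset name is a placeholder, and the read-back step only illustrates the expected columns.

from pandas import read_csv

# Hypothetical example: write the metadata TSV for a registered dataset and check its columns.
create_metadata_tsv("sketchy-photos")  # placeholder dataset name
metadata = read_csv(get_path("embeddings", "sketchy-photos.tsv"), delimiter="\t")
print(metadata.columns.tolist())  # expected: ['class_idx', 'class', 'path']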