示例#1
0
def main(args, _=None):
    """Run ``catalyst-data project-embeddings`` script.

    Reads a metadata CSV and a feature matrix stored as ``.npy``, optionally
    picks a random subset of rows, and writes the result with
    ``SummaryWriter.add_embedding`` into ``args.out_dir``.

    Args:
        args: parsed command-line namespace. Attributes read here:
            ``in_csv``, ``in_npy``, ``out_dir``, ``meta_cols``
            (comma-separated column names), ``num_rows``, ``img_col``
            and ``img_rootpath``.
        _: not used by this function.

    Raises:
        ValueError: if ``args.meta_cols`` is ``None``, or if the CSV and
            the feature array do not have the same number of rows.
    """
    import re  # local import: only needed for the metadata clean-up below

    # Validate the arguments before any I/O so that a bad invocation does
    # not read the CSV or create ``out_dir`` for nothing.
    if args.meta_cols is None:
        raise ValueError("meta-cols must not be None")
    meta_header = args.meta_cols.split(",")

    df = pd.read_csv(args.in_csv)
    os.makedirs(args.out_dir, exist_ok=True)

    # Memory-map the features so the whole array is not read eagerly.
    features = np.load(args.in_npy, mmap_mode="r")
    # Explicit exception instead of ``assert``: asserts vanish under ``-O``.
    if len(df) != len(features):
        raise ValueError(
            f"Row count mismatch: {len(df)} rows in csv "
            f"vs {len(features)} rows in features"
        )

    if args.num_rows is not None:
        # NOTE(review): ``np.random.choice`` samples with replacement by
        # default, so the same row may be picked more than once - confirm
        # this is intended.
        indices = np.random.choice(len(df), args.num_rows)
        # The same indices are applied to both so rows stay aligned.
        features = features[indices, :]
        df = df.iloc[indices]

    if args.img_col is not None:
        img_data = _load_image_data(
            rootpath=args.img_rootpath, paths=df[args.img_col].values
        )
    else:
        img_data = None

    # Collapse every run of whitespace (newlines, tabs, repeated spaces)
    # into a single space. ``str.replace`` treats r"\s" as two literal
    # characters, so a regular expression is required here.
    whitespace = re.compile(r"\s+")
    metadata = [
        [whitespace.sub(" ", str(text)).strip() for text in texts]
        for texts in df[meta_header].values.tolist()
    ]
    if len(metadata) != len(features):
        raise ValueError(
            f"Row count mismatch: {len(metadata)} metadata rows "
            f"vs {len(features)} feature rows"
        )

    summary_writer = SummaryWriter(args.out_dir)
    summary_writer.add_embedding(
        features,
        metadata=metadata,
        label_img=img_data,
        metadata_header=meta_header,
    )
    summary_writer.close()

    print(
        f"Done. Run `tensorboard --logdir={args.out_dir}` "
        "to view in Tensorboard"
    )
示例#2
0
def main(args, _=None):
    """Run ``catalyst-data project-embeddings`` script.

    Reads a metadata CSV and a feature matrix stored as ``.npy``, optionally
    picks a random subset of rows, optionally loads one image per row, and
    writes the result with ``SummaryWriter.add_embedding`` into
    ``args.out_dir``.

    Args:
        args: parsed command-line namespace. Attributes read here:
            ``in_csv``, ``in_npy``, ``out_dir``, ``meta_cols``
            (comma-separated column names), ``num_rows``, ``img_col``,
            ``img_rootpath`` and ``img_size``.
        _: not used by this function.

    Raises:
        ValueError: if ``args.meta_cols`` is ``None``, or if the CSV and
            the feature array do not have the same number of rows.
    """
    # Validate the arguments before any I/O so that a bad invocation does
    # not read the CSV or create ``out_dir`` for nothing.
    if args.meta_cols is None:
        raise ValueError("meta-cols must not be None")
    meta_header = args.meta_cols.split(",")

    df = pd.read_csv(args.in_csv)
    os.makedirs(args.out_dir, exist_ok=True)

    # Memory-map the features so the whole array is not read eagerly.
    features = np.load(args.in_npy, mmap_mode="r")
    if len(df) != len(features):
        raise ValueError(
            f"Row count mismatch: {len(df)} rows in csv "
            f"vs {len(features)} rows in features"
        )

    if args.num_rows is not None:
        # Draw row positions once and apply them to BOTH the dataframe and
        # the feature matrix; sampling only the dataframe would leave the
        # embeddings and their metadata with different lengths and order.
        row_ids = (
            pd.Series(np.arange(len(df))).sample(n=args.num_rows).to_numpy()
        )
        df = df.iloc[row_ids]
        features = features[row_ids]

    if args.img_col is not None:
        image_names = [
            path.join(args.img_rootpath, name)
            for name in df[args.img_col].values
        ]
        img_data = np.stack(
            [load_image(name, args.img_size) for name in image_names], axis=0
        )
        # Move the last axis to position 1 and scale by 1/255.
        # NOTE(review): assumes ``load_image`` returns channel-last uint8-range
        # images of identical shape - confirm against its definition.
        img_data = (
            img_data.transpose((0, 3, 1, 2)) / 255.0  # noqa: WPS432
        ).astype(np.float32)
        img_data = torch.from_numpy(img_data)
    else:
        img_data = None

    # Pass plain nested lists rather than a NumPy array: the writer is
    # documented to take a list of labels.
    metadata = df[meta_header].astype(str).values.tolist()

    summary_writer = SummaryWriter(args.out_dir)
    summary_writer.add_embedding(
        features,
        metadata=metadata,
        label_img=img_data,
        metadata_header=meta_header,
    )
    summary_writer.close()

    print(
        f"Done. Run `tensorboard --logdir={args.out_dir}` "
        "to view in Tensorboard"
    )