Example #1
def convert_cifar2imagenet_format():
    print("Convert cifar10 dataset...")
    data_root = Path(__file__).parent.parent.absolute().joinpath("data")
    cifar_path = data_root.joinpath("cifar10")
    converted_path = data_root.joinpath("cifar10_raw").joinpath("validation")
    webdata_path = data_root.joinpath("cifar10_webdata")
    if os.path.exists(converted_path) and os.path.exists(webdata_path):
        return
    ds = torchvision.datasets.CIFAR10(
            root=cifar_path.resolve(),
            train=False,
            download=True
    )
    dl = DataLoader(ds, batch_size=None)

    # Create raw imagenet format
    if not os.path.exists(converted_path):
        os.makedirs(converted_path)
        for idx, (img, label) in enumerate(dl):
            image_folder = os.path.join(converted_path, str(label))
            if not os.path.exists(image_folder):
                os.makedirs(image_folder)
            img.save(os.path.join(converted_path, str(label), "{}.jpg".format(idx)))

    # Create webdataset format
    if not os.path.exists(webdata_path):
        os.makedirs(webdata_path)
        with wds.ShardWriter(str(webdata_path.joinpath("validation-%06d.tar")), maxcount=256) as sink:
            for index, (data, label) in enumerate(dl):
                sink.write(encode_sample(data, label, index))

        # Train dataset
        train_ds = torchvision.datasets.CIFAR10(
                    root=cifar_path.resolve(),
                    train=True,
                    download=True
                    )
        train_dl = DataLoader(train_ds, batch_size=None)
        with wds.ShardWriter(str(webdata_path.joinpath("train-%06d.tar")), maxcount=256) as sink:
            for index, (data, label) in enumerate(train_dl):
                sink.write(encode_sample(data, label, index))
        metadata = {"format": "img", "validation_length": 10000, "train_length": 60000, "validation_transform_pipeline": [], "train_transform_pipeline": []}
        with open(os.path.join(webdata_path, "metadata.json"), "w") as metafile:
            json.dump(metadata, metafile)

        # Create distributed chunks for 4 nodes
        class HelperClass:
            pass
        args = HelperClass()
        args.num_instances = 4
        args.target = webdata_path
        create_distributed_remaining(args, "validation")
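
Example #1 calls a project-specific encode_sample() helper that is not shown on this page (Example #4 uses a variant that also takes bbox and image_quality). A minimal, hypothetical stand-in that follows the sample-dict convention used in the other examples, relying on the default TarWriter encoders for the "jpg" and "cls" keys, could look like this:

def encode_sample(image, label, index):
    # "__key__" must be unique per sample; the "jpg" key lets the default
    # encoder JPEG-encode a PIL image, and "cls" stores the integer label.
    return {"__key__": "%06d" % index, "jpg": image, "cls": label}
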
Example #2
File: __init__.py Project: bariarviv/cortx
def upload_cv_dataset(ds,
                      client,
                      bucket,
                      base_folder,
                      maxsize,
                      maxcount,
                      workers=0,
                      batch_size=256):
    loader = ch.utils.data.DataLoader(Packer(ds),
                                      batch_size=batch_size,
                                      num_workers=workers,
                                      shuffle=True,
                                      collate_fn=lambda x: x)

    with tempfile.TemporaryDirectory() as tempfolder:
        pattern = path.join(tempfolder, "shard-%06d.tar")
        writer = partial(upload_shard,
                         client=client,
                         bucket=bucket,
                         base_folder=base_folder)
        with wds.ShardWriter(pattern,
                             maxsize=int(maxsize),
                             maxcount=int(maxcount),
                             post=writer) as sink:
            for r in tqdm(loader):
                for ix, im, label in r:
                    sample = {"__key__": f"im-{ix}", "jpg": im, "cls": label}
                    sink.write(sample)
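
Example #2 relies on the post= hook of wds.ShardWriter: as far as the webdataset API goes, the callable passed via post= is invoked with the local path of each finished shard, with the keyword arguments pre-bound via functools.partial forwarded to it. The upload_shard function itself is project-specific; a minimal sketch, assuming a hypothetical boto3-style client, could be:

import os

def upload_shard(fname, client, bucket, base_folder):
    # Invoked by ShardWriter with the path of each shard once it is closed;
    # client, bucket and base_folder are pre-bound via functools.partial.
    key = os.path.join(base_folder, os.path.basename(fname))
    client.upload_file(fname, bucket, key)  # hypothetical S3-style upload call
    os.remove(fname)                        # drop the local copy once uploaded
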
Example #3
def split_tiles(
    images, masks, lus, workers: int, shardpattern: str, **kwargs
) -> List[Any]:
    """Split tile into subtiles in parallel and save them to disk"""

    valid_subtiles = kwargs.get("valid_subtiles", None)

    stats = []
    with wds.ShardWriter(shardpattern, maxcount=SHARDSIZE) as sink:

        data = process_map(
            partial(_split_tile, **kwargs),
            images,
            masks,
            lus,
            max_workers=workers,
            chunksize=1,
        )

        for sample in reduce(lambda z, y: z + y, data):
            if sample:
                if valid_subtiles:
                    if sample["__key__"] in valid_subtiles:
                        sink.write(sample)
                        stats.append((sample["__key__"], sample["txt"], "1"))
                else:
                    if float(sample["txt"]) > 0:
                        sink.write(sample)
                        stats.append((sample["__key__"], sample["txt"], "1"))
                    else:
                        # not included in shard
                        stats.append((sample["__key__"], sample["txt"], "0"))

    return stats
Example #4
def write_dataset(dataloader, target_path, chunksize, transform=None, image_quality=95):
    with wds.ShardWriter(target_path, maxcount=chunksize) as sink:
        for index, (data, label) in enumerate(tqdm(dataloader)):
            if isinstance(data, tuple) or isinstance(data, list):
                bbox = data[1]
                data = data[0]
            else:
                bbox = None
            if transform is not None:
                data = transform(data)
            sink.write(encode_sample(data, label, index, bbox=bbox, image_quality=image_quality))
Example #5
def write_dataset(dataloader, target_path, chunksize):
    with wds.ShardWriter(target_path, maxcount=chunksize) as sink:
        for index, (data, label) in enumerate(tqdm(dataloader)):
            if isinstance(data, Image.Image):
                sample = {"__key__": str(index), "jpg": data, "cls": label}
            else:
                sample = {
                    "__key__": str(index),
                    "pth": torch.tensor(data * 255, dtype=torch.uint8),
                    "cls": label
                }
            sink.write(sample)
Example #6
def write_dataset(imagenet, base="./shards", split="train"):

    # We're using the torchvision ImageNet dataset
    # to parse the metadata; however, we will read
    # the compressed images directly from disk (to
    # avoid having to reencode them)
    ds = datasets.ImageNet(imagenet, split=split)
    nimages = len(ds.imgs)
    print("# nimages", nimages)

    # We shuffle the indexes to make sure that we
    # don't get any large sequences of a single class
    # in the dataset.
    indexes = list(range(nimages))
    random.shuffle(indexes)

    # This is the output pattern under which we write shards.
    pattern = os.path.join(base, f"imagenet-{split}-%06d.tar")

    with wds.ShardWriter(pattern,
                         maxsize=int(args.maxsize),
                         maxcount=int(args.maxcount)) as sink:
        for i in indexes:

            # Internal information from the ImageNet dataset
            # instance: the file name and the numerical class.
            fname, cls = ds.imgs[i]
            assert cls == ds.targets[i]

            # Read the JPEG-compressed image file contents.
            image = readfile(fname)

            # Construct a unique key from the filename.
            key = os.path.splitext(os.path.basename(fname))[0]

            # Useful check.
            assert key not in all_keys
            all_keys.add(key)

            # Construct a sample.
            xkey = key if args.filekey else "%07d" % i
            sample = {"__key__": xkey, "jpg": image, "cls": cls}

            # Write the sample to the sharded tar archives.
            sink.write(sample)
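
Shards produced this way can be read back with the standard WebDataset reader; a minimal sketch (the shard pattern below is a placeholder, not taken from the example) that decodes the "jpg" payloads into PIL images:

import webdataset as wds

dataset = (
    wds.WebDataset("shards/imagenet-train-{000000..000099}.tar")
    .decode("pil")            # decode "jpg" entries into PIL images
    .to_tuple("jpg", "cls")   # yield (image, class-index) pairs
)
for image, cls in dataset:
    pass  # feed into a DataLoader / training loop here
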
Example #7
    Path(output_path + "/val").mkdir(parents=True, exist_ok=True)
    Path(output_path + "/test").mkdir(parents=True, exist_ok=True)

    random_seed = 25
    y = trainset.targets
    trainset, valset, y_train, y_val = train_test_split(
        trainset,
        y,
        stratify=y,
        shuffle=True,
        test_size=0.2,
        random_state=random_seed)

    for dataset, split_name in [(trainset, "train"), (valset, "val"),
                                (testset, "test")]:
        with wds.ShardWriter(output_path + "/" + split_name + "/" +
                             split_name + "-%d.tar",
                             maxcount=1000) as sink:
            for index, (image, cls) in enumerate(dataset):
                sink.write({
                    "__key__": "%06d" % index,
                    "ppm": image,
                    "cls": cls
                })

    entry_point = ["ls", "-R", output_path]
    run_code = subprocess.run(entry_point, stdout=subprocess.PIPE)
    print(run_code.stdout)

    visualization_arguments = {
        "output": {
            "mlpipeline_ui_metadata": args["mlpipeline_ui_metadata"],
Example #8
def write_shards(
    voxlingua_folder_path: pathlib.Path,
    shards_path: pathlib.Path,
    seed: int,
    samples_per_shard: int,
    min_dur: float,
):
    """
    Parameters
    ----------
    voxlingua_folder_path: folder where extracted voxceleb data is located
    shards_path: folder to write shards of data to
    seed: random seed used to initially shuffle data into shards
    samples_per_shard: number of data samples to store in each shards.
    """
    # make sure output folder exist
    shards_path.mkdir(parents=True, exist_ok=True)

    # find all audio files
    audio_files = sorted([f for f in voxlingua_folder_path.rglob("*.wav")])

    # create tuples (unique_sample_id, language_id, path_to_audio_file, duration)
    data_tuples = []

    # track statistics on data
    all_language_ids = set()
    sample_keys_per_language = defaultdict(list)

    for f in audio_files:
        # path should be
        # voxlingua107_folder_path/<LANG_ID>/<VIDEO---0000.000-0000.000.wav>
        m = re.match(
            r"(.*/((.+)/.+---(\d\d\d\d\.\d\d\d)-(\d\d\d\d\.\d\d\d))\.wav)",
            f.as_posix(),
        )
        if m:
            loc = m.group(1)
            key = m.group(2)
            lang = m.group(3)
            start = float(m.group(4))
            end = float(m.group(5))
            dur = end - start
            # Period is not allowed in a WebDataset key name
            key = key.replace(".", "_")
            if dur > min_dur:
                # store statistics
                all_language_ids.add(lang)
                sample_keys_per_language[lang].append(key)
                t = (key, lang, loc, dur)
                data_tuples.append(t)
        else:
            raise Exception(f"Unexpected wav name: {f}")

    all_language_ids = sorted(all_language_ids)

    # write a meta.json file which contains statistics on the data
    # which will be written to shards
    meta_dict = {
        "language_ids": list(all_language_ids),
        "sample_keys_per_language": sample_keys_per_language,
        "num_data_samples": len(data_tuples),
    }

    with (shards_path / "meta.json").open("w") as f:
        json.dump(meta_dict, f)

    # shuffle the tuples so that each shard has a large variety in languages
    random.seed(seed)
    random.shuffle(data_tuples)

    # write shards
    all_keys = set()
    shards_path.mkdir(exist_ok=True, parents=True)
    pattern = str(shards_path / "shard") + "-%06d.tar"

    with wds.ShardWriter(pattern, maxcount=samples_per_shard) as sink:
        for key, language_id, f, duration in data_tuples:

            # load the audio tensor
            tensor = load_audio(f)

            # verify key is unique
            assert key not in all_keys
            all_keys.add(key)

            # extract language_id, youtube_id and utterance_id from key
            # language_id = all_language_ids[language_id_idx]

            # create sample to write
            sample = {
                "__key__": key,
                "audio.pth": tensor,
                "language_id": language_id,
            }

            # write sample to sink
            sink.write(sample)
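
The load_audio() helper used in Example #8 is project-specific. A minimal, hypothetical version based on torchaudio (an assumption; the original project may load audio differently) would be:

import torchaudio

def load_audio(path):
    # Load the wav file and return only the waveform tensor; the sample dict
    # above stores this tensor under the "audio.pth" key.
    waveform, sample_rate = torchaudio.load(path)
    return waveform
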
Example #9
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("image_dir", type=Path)
    parser.add_argument("mask_dir", type=Path)
    parser.add_argument("lu_dir", type=Path)
    parser.add_argument("outdir", type=Path)

    num_cores = psutil.cpu_count(logical=False)
    parser.add_argument(
        "--workers",
        dest="workers",
        type=int,
        default=num_cores,
        help="number of workers for parallel execution [def: %(default)s]",
    )

    parser.add_argument(
        "--source_dim",
        dest="source_dim",
        type=int,
        default=2048,
        help="size of input tiles [def: %(default)s]",
    )

    parser.add_argument(
        "--tile_size",
        dest="tile_size",
        type=int,
        default=256,
        help="size of final tiles that are then passed to the model [def: %(default)s]",
    )

    parser.add_argument(
        "--format",
        dest="format",
        type=str,
        default="TIFF",
        choices=["PNG", "TIFF"],
        help="target file format (PNG, TIFF) [def: %(default)s]",
    )

    parser.add_argument(
        "--tmp-dir",
        dest="tmp_dir",
        type=Path,
        default=None,
        help="use this location as tmp dir",
    )

    parser.add_argument(
        "--subdir",
        dest="sub_dir",
        default="train",
        help="use this location as sub_dir",
    )

    parser.add_argument(
        "--stats",
        dest="stats_file",
        type=Path,
        default=Path("stats.csv"),
        help="use this file to record stats",
    )

    args = parser.parse_args()

    args.outdir.mkdir(parents=True, exist_ok=True)
    Path(args.outdir / args.sub_dir).mkdir(parents=True, exist_ok=True)

    if args.tmp_dir:
        print(f"Using custom tmp dir: {args.tmp_dir}")
        Path(args.tmp_dir).mkdir(parents=True, exist_ok=True)

    if args.format == "TIFF":
        suffix = "tif"
    elif args.format == "PNG":
        suffix = "png"
    else:
        raise NotImplementedError

    SHUFFLE = True  # shuffle subtile order within shards (with fixed seed)

    # subtile_stats = split_tiles(train_files)
    images = sorted(args.image_dir.glob("*.tif"))
    masks = sorted(args.mask_dir.glob("*.tif"))
    lus = sorted(args.lu_dir.glob("*.tif"))

    image_names = {i.name for i in images}
    mask_names = {i.name for i in masks}
    lu_names = {i.name for i in lus}

    # limit set of images to images that have equivalent mask tiles
    train_images = [
        i
        for i in images
        if i.name in image_names.intersection(mask_names).intersection(lu_names)
    ]
    train_masks = [
        i
        for i in masks
        if i.name in mask_names.intersection(image_names).intersection(lu_names)
    ]
    train_lus = [
        i
        for i in lus
        if i.name in lu_names.intersection(mask_names).intersection(image_names)
    ]

    train_images = sorted(train_images)
    train_masks = sorted(train_masks)
    train_lus = sorted(train_lus)

    # print(len(train_images))
    # print(len(train_masks))
    # exit()
    # print(len(train_lus))

    cfg = dict(
        source_dim=args.source_dim,
        tile_size=args.tile_size,
        format=args.format,
    )

    subtile_stats = split_tiles(
        train_images,
        train_masks,
        train_lus,
        args.workers,
        str(args.outdir / args.sub_dir / "train-%06d.tar"),
        **cfg,
    )

    with open(args.outdir / args.stats_file, "w") as fout:
        fout.write("tile,frac,status\n")
        for i, (fname, frac, status) in enumerate(subtile_stats):
            line = f"{fname},{frac},{status}\n"
            fout.write(line)

    # rebalance shards so we get similar distributions in all shards
    with tempfile.TemporaryDirectory(dir=args.tmp_dir) as tmpdir:
        print(f"Created a temporary directory: {tmpdir}")

        print("Extract source tars")
        # untar input
        for tf_name in sorted((args.outdir / args.sub_dir).glob("train-00*.tar")):
            with tarfile.open(tf_name) as tf:
                tf.extractall(tmpdir)

        print("Write balanced shards from deadtree samples")
        df = pd.read_csv(args.outdir / args.stats_file)

        df = df[df.status > 0]

        n_valid = len(df)
        splits = split_df(df, SHARDSIZE)

        # preserve last shard if more than 50% of values are present
        if SHARDSIZE // 2 < len(splits[-1]) < SHARDSIZE:
            # fill last shard with duplicates (not ideal...)
            n_missing = SHARDSIZE - len(splits[-1])
            # df_extra = splits[-1].sample(n=n_missing, random_state=42)
            splits[-1].extend(np.random.choice(splits[-1], size=n_missing).tolist())

        # drop incomplete shards
        splits = [x for x in splits if len(x) == SHARDSIZE]
        assert len(splits) > 0, "Something went wrong"

        for s_cnt, s in enumerate(splits):

            with tarfile.open(
                args.outdir / args.sub_dir / f"train-balanced-{s_cnt:06}.tar", "w"
            ) as dst:

                if SHUFFLE:
                    random.shuffle(s)
                for i in s:
                    dst.add(f"{tmpdir}/{i}.mask.{suffix}", f"{i}.mask.{suffix}")
                    dst.add(f"{tmpdir}/{i}.lu.{suffix}", f"{i}.lu.{suffix}")
                    dst.add(f"{tmpdir}/{i}.rgbn.{suffix}", f"{i}.rgbn.{suffix}")
                    dst.add(f"{tmpdir}/{i}.txt", f"{i}.txt")

    # create sets for random tile dataset
    # use all subtiles not covered in train

    n_subtiles = (args.source_dim // args.tile_size) ** 2

    all_subtiles = []
    for image_name in image_names:
        all_subtiles.extend(
            [f"{Path(image_name).stem}_{c:03}" for c in range(n_subtiles)]
        )
    all_subtiles = set(all_subtiles)

    n_samples = n_valid * OVERSAMPLE_FACTOR
    random_subtiles = random.sample(
        tuple(all_subtiles - set([x[0] for x in subtile_stats if int(x[2]) == 1])),
        n_samples,
    )

    # the necessary tile to process
    random_tiles = sorted(list(set([x[:-4] for x in random_subtiles])))

    all_images = sorted(args.image_dir.glob("*.tif"))
    random_images = [x for x in all_images if x.stem in random_tiles]

    print("STATS")
    print(len(all_subtiles))
    print(len(subtile_stats))
    print(len(random_subtiles))
    print(len(random_images))

    cfg = dict(
        source_dim=args.source_dim,
        tile_size=args.tile_size,
        format=args.format,
        valid_subtiles=random_subtiles,  # subset data with random selection of subtiles
    )

    random_images_names = {i.name for i in random_images}
    random_lus = [i for i in lus if i.name in random_images_names]

    subtile_stats_rnd = split_tiles(
        random_images,
        [None] * len(random_images),
        random_lus,
        args.workers,
        str(args.outdir / args.sub_dir / "train-randomsamples-%06d.tar"),
        **cfg,
    )

    stats_file_rnd = Path(args.stats_file.stem + "_rnd.csv")
    with open(args.outdir / stats_file_rnd, "w") as fout:
        fout.write("tile,frac,status\n")
        for i, (fname, frac, status) in enumerate(subtile_stats_rnd):
            line = f"{fname},{frac},{status}\n"
            fout.write(line)

    # also create combo dataset
    # source A: train-balanced, source B: randomsample
    # NOTE: combo dataset has double the default shardsize (2*128), samples alternate between regular and random sample
    train_balanced_shards = [
        str(x) for x in sorted((args.outdir / args.sub_dir).glob("train-balanced*"))
    ]
    train_balanced_shards_rnd = [
        str(x) for x in sorted((args.outdir / args.sub_dir).glob("train-random*"))
    ]
    train_balanced_shards_rnd = train_balanced_shards_rnd[: len(train_balanced_shards)]

    shardpattern = str(args.outdir / args.sub_dir / "train-combo-%06d.tar")

    with wds.ShardWriter(shardpattern, maxcount=SHARDSIZE * 2) as sink:
        for shardA, shardB in zip(train_balanced_shards, train_balanced_shards_rnd):

            for sA, sB in zip(wds.WebDataset(shardA), wds.WebDataset(shardB)):
                sink.write(sA)
                sink.write(sB)

    # remove everything but train & combo
    for filename in (args.outdir / args.sub_dir).glob("train-random*"):
        filename.unlink()
    for filename in (args.outdir / args.sub_dir).glob("train-balanced*"):
        filename.unlink()
    for filename in (args.outdir / args.sub_dir).glob("train-0*"):
        filename.unlink()
Example #10
    if not train.exists():
        train.mkdir()
    if not test.exists():
        test.mkdir()
    org1 = Path('/media/zqh/Documents/JOJO_face_crop_big')
    org2 = Path('/home/zqh/workspace/data512x512')

    test_ids = []
    train_ids = []

    for org in [org1, org2]:
        ids = list(set([p.stem for p in org.iterdir()]))
        n = len(ids)
        test_n = int(n * 0.1)
        for id in ids[:test_n]:
            test_ids.append(org / id)

        for id in ids[test_n:]:
            train_ids.append(org / id)

    for dst_root, ids in [(test, test_ids), (train, train_ids)]:
        total = len(ids)
        pattern = dst_root.as_posix() + f'-{total}-%d.tar'
        with wds.ShardWriter(pattern, maxcount=5000, encoder=False) as f:
            for id in ids:
                with open(id.as_posix() + '.jpg', "rb") as stream:
                    image = stream.read()
                with open(id.as_posix() + '.json', "rb") as stream:
                    json_bytes = stream.read()  # avoid shadowing the json module
                key = id.name
                f.write({'__key__': key, 'jpg': image, 'json': json_bytes})