def convert_cifar2imagenet_format():
    print("Convert cifar10 dataset...")
    cifar_path = Path(__file__).parent.parent.absolute().joinpath("data").joinpath("cifar10")
    converted_path = (
        Path(__file__).parent.parent.absolute().joinpath("data").joinpath("cifar10_raw").joinpath("validation")
    )
    webdata_path = Path(__file__).parent.parent.absolute().joinpath("data").joinpath("cifar10_webdata")

    if os.path.exists(converted_path) and os.path.exists(webdata_path):
        return

    ds = torchvision.datasets.CIFAR10(root=cifar_path.resolve(), train=False, download=True)
    dl = DataLoader(ds, batch_size=None)

    # Create raw imagenet format
    if not os.path.exists(converted_path):
        os.makedirs(converted_path)
        for idx, (img, label) in enumerate(dl):
            image_folder = os.path.join(converted_path, str(label))
            if not os.path.exists(image_folder):
                os.makedirs(image_folder)
            img.save(os.path.join(converted_path, str(label), "{}.jpg".format(idx)))

    # Create webdataset format
    if not os.path.exists(webdata_path):
        os.makedirs(webdata_path)
        with wds.ShardWriter(str(webdata_path.joinpath("validation-%06d.tar")), maxcount=256) as sink:
            for index, (data, label) in enumerate(dl):
                sink.write(encode_sample(data, label, index))

        # Train dataset
        train_ds = torchvision.datasets.CIFAR10(root=cifar_path.resolve(), train=True, download=True)
        train_dl = DataLoader(train_ds, batch_size=None)
        with wds.ShardWriter(str(webdata_path.joinpath("train-%06d.tar")), maxcount=256) as sink:
            for index, (data, label) in enumerate(train_dl):
                sink.write(encode_sample(data, label, index))

        metadata = {
            "format": "img",
            "validation_length": 10000,
            "train_length": 50000,  # CIFAR-10 train split has 50,000 images
            "validation_transform_pipeline": [],
            "train_transform_pipeline": [],
        }
        with open(os.path.join(webdata_path, "metadata.json"), "w") as metafile:
            json.dump(metadata, metafile)

    # Create distributed chunks for 4 nodes
    class HelperClass:
        pass

    args = HelperClass()
    args.num_instances = 4
    args.target = webdata_path
    create_distributed_remaining(args, "validation")
def upload_cv_dataset(ds, client, bucket, base_folder, maxsize, maxcount, workers=0, batch_size=256):
    loader = ch.utils.data.DataLoader(
        Packer(ds),
        batch_size=batch_size,
        num_workers=workers,
        shuffle=True,
        collate_fn=lambda x: x,
    )
    with tempfile.TemporaryDirectory() as tempfolder:
        pattern = path.join(tempfolder, "shard-%06d.tar")
        writer = partial(upload_shard, client=client, bucket=bucket, base_folder=base_folder)
        with wds.ShardWriter(pattern, maxsize=int(maxsize), maxcount=int(maxcount), post=writer) as sink:
            for r in tqdm(loader):
                for ix, im, label in r:
                    sample = {"__key__": f"im-{ix}", "jpg": im, "cls": label}
                    sink.write(sample)
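The `upload_shard` callback bound with `partial` above is not shown. `wds.ShardWriter` calls its `post=` hook with the path of each shard it finishes, so a minimal sketch, assuming a boto3-style S3 client (the helper body below is an assumption, not the original implementation), could be:

import os

# Assumed sketch of upload_shard(): upload each finished shard to object
# storage and delete the local copy so the temporary folder stays small.
def upload_shard(fname, client, bucket, base_folder):
    key = os.path.join(base_folder, os.path.basename(fname))
    client.upload_file(fname, bucket, key)  # boto3-style upload call (assumption)
    os.unlink(fname)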
def split_tiles(images, masks, lus, workers: int, shardpattern: str, **kwargs) -> List[Any]:
    """Split tile into subtiles in parallel and save them to disk"""
    valid_subtiles = kwargs.get("valid_subtiles", None)

    stats = []
    with wds.ShardWriter(shardpattern, maxcount=SHARDSIZE) as sink:
        data = process_map(
            partial(_split_tile, **kwargs),
            images,
            masks,
            lus,
            max_workers=workers,
            chunksize=1,
        )
        for sample in reduce(lambda z, y: z + y, data):
            if sample:
                if valid_subtiles:
                    if sample["__key__"] in valid_subtiles:
                        sink.write(sample)
                        stats.append((sample["__key__"], sample["txt"], "1"))
                else:
                    if float(sample["txt"]) > 0:
                        sink.write(sample)
                        stats.append((sample["__key__"], sample["txt"], "1"))
                    else:
                        # not included in shard
                        stats.append((sample["__key__"], sample["txt"], "0"))
    return stats
def write_dataset(dataloader, target_path, chunksize, transform=None, image_quality=95):
    with wds.ShardWriter(target_path, maxcount=chunksize) as sink:
        for index, (data, label) in enumerate(tqdm(dataloader)):
            if isinstance(data, (tuple, list)):
                bbox = data[1]
                data = data[0]
            else:
                bbox = None
            if transform is not None:
                data = transform(data)
            sink.write(encode_sample(data, label, index, bbox=bbox, image_quality=image_quality))
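The `encode_sample` helper used here (and in the CIFAR-10 converter above) is not shown. A minimal sketch, assuming it packs a PIL image, its label, and an optional bounding box into a WebDataset sample dict (keys other than `__key__` become file extensions inside the shard):

import io
import json

# Hypothetical sketch of encode_sample(), not the original implementation:
# encode the PIL image to JPEG bytes at the requested quality and attach the
# label (and optional bbox) under extension-style keys.
def encode_sample(data, label, index, bbox=None, image_quality=95):
    buffer = io.BytesIO()
    data.save(buffer, format="JPEG", quality=image_quality)
    sample = {
        "__key__": str(index),     # unique key; becomes the member file stem
        "jpg": buffer.getvalue(),  # pre-encoded JPEG bytes
        "cls": label,              # integer class label
    }
    if bbox is not None:
        sample["json"] = json.dumps({"bbox": bbox})
    return sample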
def write_dataset(dataloader, target_path, chunksize):
    with wds.ShardWriter(target_path, maxcount=chunksize) as sink:
        for index, (data, label) in enumerate(tqdm(dataloader)):
            if isinstance(data, Image.Image):
                sample = {"__key__": str(index), "jpg": data, "cls": label}
            else:
                sample = {
                    "__key__": str(index),
                    "pth": torch.tensor(data * 255, dtype=torch.uint8),
                    "cls": label,
                }
            sink.write(sample)
def write_dataset(imagenet, base="./shards", split="train"):
    # We're using the torchvision ImageNet dataset
    # to parse the metadata; however, we will read
    # the compressed images directly from disk (to
    # avoid having to reencode them)
    ds = datasets.ImageNet(imagenet, split=split)
    nimages = len(ds.imgs)
    print("# nimages", nimages)

    # We shuffle the indexes to make sure that we
    # don't get any large sequences of a single class
    # in the dataset.
    indexes = list(range(nimages))
    random.shuffle(indexes)

    # This is the output pattern under which we write shards.
    pattern = os.path.join(base, f"imagenet-{split}-%06d.tar")

    with wds.ShardWriter(pattern, maxsize=int(args.maxsize), maxcount=int(args.maxcount)) as sink:
        for i in indexes:
            # Internal information from the ImageNet dataset
            # instance: the file name and the numerical class.
            fname, cls = ds.imgs[i]
            assert cls == ds.targets[i]

            # Read the JPEG-compressed image file contents.
            image = readfile(fname)

            # Construct a unique key from the filename.
            key = os.path.splitext(os.path.basename(fname))[0]

            # Useful check.
            assert key not in all_keys
            all_keys.add(key)

            # Construct a sample.
            xkey = key if args.filekey else "%07d" % i
            sample = {"__key__": xkey, "jpg": image, "cls": cls}

            # Write the sample to the sharded tar archives.
            sink.write(sample)
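Shards written this way can be read back with the standard WebDataset reading API; the brace-expanded shard range below is only illustrative and should match what was actually written:

import webdataset as wds

# Illustrative read-back: decode "jpg" entries to PIL images and yield
# (image, class) pairs.
dataset = (
    wds.WebDataset("./shards/imagenet-train-{000000..000099}.tar")
    .decode("pil")
    .to_tuple("jpg", "cls")
)
for image, cls in dataset:
    pass  # feed into a DataLoader / training loop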
Path(output_path + "/val").mkdir(parents=True, exist_ok=True)
Path(output_path + "/test").mkdir(parents=True, exist_ok=True)

random_seed = 25
y = trainset.targets
trainset, valset, y_train, y_val = train_test_split(
    trainset, y, stratify=y, shuffle=True, test_size=0.2, random_state=random_seed
)

for dataset, split_name in [(trainset, "train"), (valset, "val"), (testset, "test")]:
    with wds.ShardWriter(
        output_path + "/" + split_name + "/" + split_name + "-%d.tar", maxcount=1000
    ) as sink:
        for index, (image, cls) in enumerate(dataset):
            sink.write({
                "__key__": "%06d" % index,
                "ppm": image,
                "cls": cls,
            })

entry_point = ["ls", "-R", output_path]
run_code = subprocess.run(entry_point, stdout=subprocess.PIPE)
print(run_code.stdout)

visualization_arguments = {
    "output": {
        "mlpipeline_ui_metadata": args["mlpipeline_ui_metadata"],
def write_shards(
    voxlingua_folder_path: pathlib.Path,
    shards_path: pathlib.Path,
    seed: int,
    samples_per_shard: int,
    min_dur: float,
):
    """
    Parameters
    ----------
    voxlingua_folder_path: folder where extracted voxlingua data is located
    shards_path: folder to write shards of data to
    seed: random seed used to initially shuffle data into shards
    samples_per_shard: number of data samples to store in each shard
    min_dur: minimum duration (in seconds) a sample needs to be included
    """
    # make sure output folder exists
    shards_path.mkdir(parents=True, exist_ok=True)

    # find all audio files
    audio_files = sorted([f for f in voxlingua_folder_path.rglob("*.wav")])

    # create tuples (unique_sample_id, language_id, path_to_audio_file, duration)
    data_tuples = []

    # track statistics on data
    all_language_ids = set()
    sample_keys_per_language = defaultdict(list)

    for f in audio_files:
        # path should be
        # voxlingua107_folder_path/<LANG_ID>/<VIDEO---0000.000-0000.000.wav>
        m = re.match(
            r"(.*/((.+)/.+---(\d\d\d\d\.\d\d\d)-(\d\d\d\d\.\d\d\d))\.wav)",
            f.as_posix(),
        )
        if m:
            loc = m.group(1)
            key = m.group(2)
            lang = m.group(3)
            start = float(m.group(4))
            end = float(m.group(5))
            dur = end - start

            # Period is not allowed in a WebDataset key name
            key = key.replace(".", "_")

            if dur > min_dur:
                # store statistics
                all_language_ids.add(lang)
                sample_keys_per_language[lang].append(key)

                t = (key, lang, loc, dur)
                data_tuples.append(t)
        else:
            raise Exception(f"Unexpected wav name: {f}")

    all_language_ids = sorted(all_language_ids)

    # write a meta.json file which contains statistics on the data
    # which will be written to shards
    meta_dict = {
        "language_ids": list(all_language_ids),
        "sample_keys_per_language": sample_keys_per_language,
        "num_data_samples": len(data_tuples),
    }

    with (shards_path / "meta.json").open("w") as f:
        json.dump(meta_dict, f)

    # shuffle the tuples so that each shard has a large variety in languages
    random.seed(seed)
    random.shuffle(data_tuples)

    # write shards
    all_keys = set()
    shards_path.mkdir(exist_ok=True, parents=True)
    pattern = str(shards_path / "shard") + "-%06d.tar"

    with wds.ShardWriter(pattern, maxcount=samples_per_shard) as sink:
        for key, language_id, f, duration in data_tuples:
            # load the audio tensor
            tensor = load_audio(f)

            # verify key is unique
            assert key not in all_keys
            all_keys.add(key)

            # create sample to write
            sample = {
                "__key__": key,
                "audio.pth": tensor,
                "language_id": language_id,
            }

            # write sample to sink
            sink.write(sample)
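The `load_audio` helper is not shown. A minimal sketch, assuming torchaudio is used to read the wav file (the helper body is an assumption, not the original implementation); the "audio.pth" key above makes ShardWriter serialize the returned tensor with torch.save:

import torch
import torchaudio

# Assumed sketch of load_audio(): return only the waveform tensor.
def load_audio(path) -> torch.Tensor:
    waveform, sample_rate = torchaudio.load(str(path))  # shape: (channels, samples)
    return waveform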
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("image_dir", type=Path)
    parser.add_argument("mask_dir", type=Path)
    parser.add_argument("lu_dir", type=Path)
    parser.add_argument("outdir", type=Path)

    num_cores = psutil.cpu_count(logical=False)
    parser.add_argument(
        "--workers",
        dest="workers",
        type=int,
        default=num_cores,
        help="number of workers for parallel execution [def: %(default)s]",
    )
    parser.add_argument(
        "--source_dim",
        dest="source_dim",
        type=int,
        default=2048,
        help="size of input tiles [def: %(default)s]",
    )
    parser.add_argument(
        "--tile_size",
        dest="tile_size",
        type=int,
        default=256,
        help="size of final tiles that are then passed to the model [def: %(default)s]",
    )
    parser.add_argument(
        "--format",
        dest="format",
        type=str,
        default="TIFF",
        choices=["PNG", "TIFF"],
        help="target file format (PNG, TIFF) [def: %(default)s]",
    )
    parser.add_argument(
        "--tmp-dir",
        dest="tmp_dir",
        type=Path,
        default=None,
        help="use this location as tmp dir",
    )
    parser.add_argument(
        "--subdir",
        dest="sub_dir",
        default="train",
        help="use this location as sub_dir",
    )
    parser.add_argument(
        "--stats",
        dest="stats_file",
        type=Path,
        default=Path("stats.csv"),
        help="use this file to record stats",
    )

    args = parser.parse_args()

    args.outdir.mkdir(parents=True, exist_ok=True)
    Path(args.outdir / args.sub_dir).mkdir(parents=True, exist_ok=True)

    if args.tmp_dir:
        print(f"Using custom tmp dir: {args.tmp_dir}")
        Path(args.tmp_dir).mkdir(parents=True, exist_ok=True)

    if args.format == "TIFF":
        suffix = "tif"
    elif args.format == "PNG":
        suffix = "png"
    else:
        raise NotImplementedError

    SHUFFLE = True  # shuffle subtile order within shards (with fixed seed)

    images = sorted(args.image_dir.glob("*.tif"))
    masks = sorted(args.mask_dir.glob("*.tif"))
    lus = sorted(args.lu_dir.glob("*.tif"))

    image_names = {i.name for i in images}
    mask_names = {i.name for i in masks}
    lu_names = {i.name for i in lus}

    # limit set of images to images that have equivalent mask tiles
    train_images = [
        i for i in images
        if i.name in image_names.intersection(mask_names).intersection(lu_names)
    ]
    train_masks = [
        i for i in masks
        if i.name in mask_names.intersection(image_names).intersection(lu_names)
    ]
    train_lus = [
        i for i in lus
        if i.name in lu_names.intersection(mask_names).intersection(image_names)
    ]

    train_images = sorted(train_images)
    train_masks = sorted(train_masks)
    train_lus = sorted(train_lus)

    cfg = dict(
        source_dim=args.source_dim,
        tile_size=args.tile_size,
        format=args.format,
    )

    subtile_stats = split_tiles(
        train_images,
        train_masks,
        train_lus,
        args.workers,
        str(args.outdir / args.sub_dir / "train-%06d.tar"),
        **cfg,
    )

    with open(args.outdir / args.stats_file, "w") as fout:
        fout.write("tile,frac,status\n")
        for i, (fname, frac, status) in enumerate(subtile_stats):
            line = f"{fname},{frac},{status}\n"
            fout.write(line)

    # rebalance shards so we get similar distributions in all shards
    with tempfile.TemporaryDirectory(dir=args.tmp_dir) as tmpdir:
        print(f"Created a temporary directory: {tmpdir}")

        print("Extract source tars")
        # untar input
        for tf_name in sorted((args.outdir / args.sub_dir).glob("train-00*.tar")):
            with tarfile.open(tf_name) as tf:
                tf.extractall(tmpdir)

        print("Write balanced shards from deadtree samples")
        df = pd.read_csv(args.outdir / args.stats_file)
        df = df[df.status > 0]
        n_valid = len(df)

        splits = split_df(df, SHARDSIZE)

        # preserve last shard if more than 50% of values are present
        if SHARDSIZE // 2 < len(splits[-1]) < SHARDSIZE:
            # fill last shard with duplicates (not ideal...)
            n_missing = SHARDSIZE - len(splits[-1])
            splits[-1].extend(np.random.choice(splits[-1], size=n_missing).tolist())

        # drop incomplete shards
        splits = [x for x in splits if len(x) == SHARDSIZE]
        assert len(splits) > 0, "Something went wrong"

        for s_cnt, s in enumerate(splits):
            with tarfile.open(
                args.outdir / args.sub_dir / f"train-balanced-{s_cnt:06}.tar", "w"
            ) as dst:
                if SHUFFLE:
                    random.shuffle(s)
                for i in s:
                    dst.add(f"{tmpdir}/{i}.mask.{suffix}", f"{i}.mask.{suffix}")
                    dst.add(f"{tmpdir}/{i}.lu.{suffix}", f"{i}.lu.{suffix}")
                    dst.add(f"{tmpdir}/{i}.rgbn.{suffix}", f"{i}.rgbn.{suffix}")
                    dst.add(f"{tmpdir}/{i}.txt", f"{i}.txt")

    # create sets for random tile dataset
    # use all subtiles not covered in train
    n_subtiles = (args.source_dim // args.tile_size) ** 2
    all_subtiles = []
    for image_name in image_names:
        all_subtiles.extend(
            [f"{Path(image_name).stem}_{c:03}" for c in range(n_subtiles)]
        )
    all_subtiles = set(all_subtiles)

    n_samples = n_valid * OVERSAMPLE_FACTOR

    random_subtiles = random.sample(
        tuple(all_subtiles - set([x[0] for x in subtile_stats if int(x[2]) == 1])),
        n_samples,
    )

    # the tiles necessary to process
    random_tiles = sorted(list(set([x[:-4] for x in random_subtiles])))

    all_images = sorted(args.image_dir.glob("*.tif"))
    random_images = [x for x in all_images if x.stem in random_tiles]

    print("STATS")
    print(len(all_subtiles))
    print(len(subtile_stats))
    print(len(random_subtiles))
    print(len(random_images))

    cfg = dict(
        source_dim=args.source_dim,
        tile_size=args.tile_size,
        format=args.format,
        valid_subtiles=random_subtiles,  # subset data with random selection of subtiles
    )

    random_images_names = {i.name for i in random_images}
    random_lus = [i for i in lus if i.name in random_images_names]

    subtile_stats_rnd = split_tiles(
        random_images,
        [None] * len(random_images),
        random_lus,
        args.workers,
        str(args.outdir / args.sub_dir / "train-randomsamples-%06d.tar"),
        **cfg,
    )

    stats_file_rnd = Path(args.stats_file.stem + "_rnd.csv")
    with open(args.outdir / stats_file_rnd, "w") as fout:
        fout.write("tile,frac,status\n")
        for i, (fname, frac, status) in enumerate(subtile_stats_rnd):
            line = f"{fname},{frac},{status}\n"
            fout.write(line)

    # also create combo dataset
    # source A: train-balanced, source B: randomsample
    # NOTE: combo dataset has double the default shardsize (2*128);
    #       samples alternate between regular and random samples
    train_balanced_shards = [
        str(x) for x in sorted((args.outdir / args.sub_dir).glob("train-balanced*"))
    ]
    train_balanced_shards_rnd = [
        str(x) for x in sorted((args.outdir / args.sub_dir).glob("train-random*"))
    ]
    train_balanced_shards_rnd = train_balanced_shards_rnd[: len(train_balanced_shards)]

    shardpattern = str(args.outdir / args.sub_dir / "train-combo-%06d.tar")

    with wds.ShardWriter(shardpattern, maxcount=SHARDSIZE * 2) as sink:
        for shardA, shardB in zip(train_balanced_shards, train_balanced_shards_rnd):
            for sA, sB in zip(wds.WebDataset(shardA), wds.WebDataset(shardB)):
                sink.write(sA)
                sink.write(sB)

    # remove everything but train & combo
    for filename in (args.outdir / args.sub_dir).glob("train-random*"):
        filename.unlink()
    for filename in (args.outdir / args.sub_dir).glob("train-balanced*"):
        filename.unlink()
    for filename in (args.outdir / args.sub_dir).glob("train-0*"):
        filename.unlink()
if not train.exists():
    train.mkdir()
if not test.exists():
    test.mkdir()

org1 = Path('/media/zqh/Documents/JOJO_face_crop_big')
org2 = Path('/home/zqh/workspace/data512x512')

test_ids = []
train_ids = []
for org in [org1, org2]:
    ids = list(set([p.stem for p in org.iterdir()]))
    n = len(ids)
    test_n = int(n * 0.1)
    for id in ids[:test_n]:
        test_ids.append(org / id)
    for id in ids[test_n:]:
        train_ids.append(org / id)

for dst_root, ids in [(test, test_ids), (train, train_ids)]:
    total = len(ids)
    pattern = dst_root.as_posix() + f'-{total}-%d.tar'
    with wds.ShardWriter(pattern, maxcount=5000, encoder=False) as f:
        for id in ids:
            with open(id.as_posix() + '.jpg', "rb") as stream:
                image = stream.read()
            with open(id.as_posix() + '.json', "rb") as stream:
                json = stream.read()
            key = id.name
            f.write({'__key__': key, 'jpg': image, 'json': json})