def main():
    t1 = time.time()

    # Start hub with one worker process per CPU core and a 55 GB memory limit.
    hub.init(processes=True, n_workers=psutil.cpu_count(), memory_limit=55e9)

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "dataset_path",
        metavar="P",
        type=str,
        help="Path to coco2017 dataset",
        default="./data/COCOdataset2017",
    )
    parser.add_argument(
        "output_path",
        metavar="N",
        type=str,
        help="Dataset output path",
        default="COCOdataset2017",
    )
    parser.add_argument("year", metavar="Y", type=str, default="2017")
    args = parser.parse_args()

    # Load the train and val splits, then merge them into a single dataset.
    tags = ["train", "val"]
    ds = {tag: load_dataset(args, tag) for tag in tags}
    for tag in ds:
        print(f"{tag}: {len(ds[tag])} samples")
    ds = dataset.concat([ds[tag] for tag in tags])
    # ds = ds["train"]

    # Persist the combined dataset to the requested output path.
    ds.store(f"{args.output_path}")

    t2 = time.time()
    logger.info(f"Pipeline took {(t2 - t1) / 60} minutes")
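# Example invocation, assuming main() is the entry point of a script file
# (the filename "upload_coco.py" and the paths below are illustrative, not
# part of this snippet). All three positional arguments must be supplied:
#
#   python upload_coco.py ./data/COCOdataset2017 ./coco_hub_output 2017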
def test_dataset_concat():
    t1 = tensor.from_array(np.array([5, 6, 7], dtype="int32"))
    t2 = tensor.from_array(np.array([1, 2, 3], dtype="int32"))
    ds1 = dataset.from_tensors({"t1": t1})
    ds2 = dataset.from_tensors({"t1": t2})

    # Concatenation should append ds2's samples after ds1's, in order.
    ds = dataset.concat([ds1, ds2])

    assert len(ds) == 6
    assert (ds["t1"].compute() == np.array([5, 6, 7, 1, 2, 3], dtype="int32")).all()