import argparse
import logging
import time

import psutil

import hub
from hub import dataset  # assumed import: provides dataset.concat

logger = logging.getLogger(__name__)


def main():
    t1 = time.time()
    # One worker process per CPU core, with a ~55 GB memory limit.
    hub.init(processes=True, n_workers=psutil.cpu_count(), memory_limit=55e9)

    parser = argparse.ArgumentParser()
    # nargs="?" makes the positionals optional so their defaults can apply.
    parser.add_argument(
        "dataset_path",
        metavar="P",
        type=str,
        nargs="?",
        help="Path to coco2017 dataset",
        default="./data/COCOdataset2017",
    )
    parser.add_argument(
        "output_path",
        metavar="N",
        type=str,
        nargs="?",
        help="Dataset output path",
        default="COCOdataset2017",
    )
    parser.add_argument("year", metavar="Y", type=str, nargs="?", default="2017")
    args = parser.parse_args()

    # Load each split (load_dataset is defined elsewhere in this file),
    # print its size, then concatenate the splits and store the result.
    tags = ["train", "val"]
    ds = {tag: load_dataset(args, tag) for tag in tags}
    for tag in ds:
        print(f"{tag}: {len(ds[tag])} samples")
    ds = dataset.concat([ds[tag] for tag in tags])
    # ds = ds["train"]
    ds.store(f"{args.output_path}")

    t2 = time.time()
    logger.info(f"Pipeline took {(t2 - t1) / 60} minutes")
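# Hypothetical sketch of the load_dataset(args, tag) helper the pipeline above
# assumes: read the COCO annotation file for one split with pycocotools and
# collect one record per image. The real helper presumably also packs these
# records into a hub dataset (so that dataset.concat and ds.store work); that
# step is omitted here. File layout follows the standard COCO 2017 convention.
import os

from pycocotools.coco import COCO


def load_dataset_sketch(args, tag):
    ann_file = os.path.join(
        args.dataset_path, "annotations", f"instances_{tag}{args.year}.json"
    )
    coco = COCO(ann_file)  # parses the annotation JSON into an index
    records = []
    for img_id in coco.getImgIds():
        (info,) = coco.loadImgs(img_id)
        anns = coco.loadAnns(coco.getAnnIds(imgIds=img_id))
        records.append(
            {
                "image_path": os.path.join(
                    args.dataset_path, f"{tag}{args.year}", info["file_name"]
                ),
                "height": info["height"],
                "width": info["width"],
                "annotations": anns,
            }
        )
    return records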
import argparse


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-c",
        "--config",
        help="configuration name, used to find subdirectory in $root/config/",
        action="store",
        default="",
    )
    args = parser.parse_args()

    # hub, CPL, and startAllListeners are provided by the surrounding module.
    # Initialize the hub with the named configuration, start every listener
    # declared in the config, then enter the main loop.
    hub.init(configName=args.config)
    startAllListeners(CPL.cfg.get("hub", "listeners", doFlush=True))
    hub.run()
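# Usage sketch (the script and configuration names are assumptions): launch
# the hub with a named configuration, whose files are looked up in a
# subdirectory of $root/config/:
#
#     python runhub.py --config ops
#
# Omitting -c/--config leaves the configuration name as the empty string.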
"download": t3 - t2, "write_to_fs": t1 - t0, } def upload_and_download(samples=30, chunksize=None, name="hub"): """ Uploads dataset into S3 and then downlods using hub package """ ds = generate_dataset([(samples, 256, 256), (samples, 256, 256)], chunksize=1) t1 = time.time() ds = ds.store(f"{BUCKET}/transfer/upload") t2 = time.time() ds.store("/tmp/download") t3 = time.time() return {"name": name, "upload": t2 - t1, "download": t3 - t2} if __name__ == "__main__": samples = 64 chunksize = None import hub hub.init(processes=True, n_workers=8, threads_per_worker=1) r1 = upload_and_download(samples, chunksize=chunksize) r2 = aws_cli_copy(samples, chunksize=chunksize) report([r1, r2])