def create_test_fixtures(cls):
  log = util.create_log()
  log.info("Creating bdd100k test fixtures ...")

  ZIPS_TO_COPY = (cls.telemetry_zip(),)
  util.cleandir(cls.TEST_FIXTURE_DIR)
  for path in ZIPS_TO_COPY:
    util.copy_n_from_zip(path, cls.test_fixture(path), 10)

  # Videos: just copy the ones that have INFO data
  log.info("Copying videos ...")
  fws = util.ArchiveFileFlyweight.fws_from(
    cls.test_fixture(cls.telemetry_zip()))
  for fw in fws:
    if 'json' not in fw.name:
      continue

    relpath = InfoDataset.json_fname_to_video_fname(fw.name)
    relpath = relpath[len('bdd100k/info/'):]
    path = os.path.join(cls.video_dir(), relpath)
    dest = cls.test_fixture(path)
    util.mkdir(os.path.dirname(dest))
    util.run_cmd('cp -v ' + path + ' ' + dest)
  log.info("... done copying videos.")

  # For testing, create a video that has no INFO
  dest = cls.test_fixture(
    os.path.join(cls.video_dir(), '100k', 'train', 'video_with_no_info.mov'))
  codec = 'h264'  # Chrome will not play `png` movies
  video_bytes = testutils.VideoFixture(codec=codec).get_bytes()
  with open(dest, 'wb') as f:
    f.write(video_bytes)
  log.info("Wrote synth video to %s ..." % dest)

def test_spark_numpy_df():
  TEST_TEMPDIR = os.path.join(testconf.TEST_TEMPDIR_ROOT, 'spark_numpy_df')
  util.cleandir(TEST_TEMPDIR)

  import numpy as np
  rows = [
    {
      'id': 1,
      'a': np.array([1]),
      'b': np.array([[1]]),
      'c': np.array([[[1]], [[2]], [[3]]]),
    },
    {
      'id': 2,
      'a': np.array([]),
      'b': None,
      'c': None,
    },
  ]

  # Test serialization numpy <-> parquet
  with testutils.LocalSpark.sess() as spark:
    from pyspark.sql import Row
    wrapped_rows = [
      Row(**dict((k, NumpyArray(v)) for k, v in row.iteritems()))
      for row in rows
    ]

    df = spark.createDataFrame(wrapped_rows)
    df.show()
    outpath = os.path.join(TEST_TEMPDIR, 'rowdata')
    df.write.parquet(outpath)

    df2 = spark.read.parquet(outpath)
    decoded_wrapped_rows = df2.collect()
    decoded_rows = [
      dict((k, v.arr if v else v) for k, v in row.asDict().iteritems())
      for row in decoded_wrapped_rows
    ]

    # We can't do `assert sorted(rows) == sorted(decoded_rows)`
    # because numpy syntactic sugar breaks ==
    import pprint
    def sorted_row_str(rowz):
      return pprint.pformat(sorted(rowz, key=lambda row: row['id']))
    assert sorted_row_str(rows) == sorted_row_str(decoded_rows)

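# NOTE: `NumpyArray` used above comes from elsewhere in this project.  The
# class below is only a hedged sketch of the underlying idea (an assumption
# for illustration, not the project's actual implementation): to survive a
# Spark <-> parquet round trip, an ndarray is stored as raw bytes plus the
# dtype and shape needed to rebuild it.
class _NumpyArrayRoundTripSketch(object):
  def __init__(self, arr):
    self.dtype = arr.dtype.str   # e.g. '<i8'
    self.shape = arr.shape
    self.data = arr.tobytes()    # raw buffer; parquet stores this as binary

  @property
  def arr(self):
    import numpy as np
    return np.frombuffer(self.data, dtype=self.dtype).reshape(self.shape)
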
def test_spark_archive_zip():
  TEST_TEMPDIR = os.path.join(
    testconf.TEST_TEMPDIR_ROOT, 'test_spark_archive_zip')
  util.cleandir(TEST_TEMPDIR)

  # Create the fixture
  ss = ['foo', 'bar', 'baz']
  fixture_path = os.path.join(TEST_TEMPDIR, 'test.zip')

  import zipfile
  with zipfile.ZipFile(fixture_path, mode='w') as z:
    for s in ss:
      z.writestr(s, s)

  with testutils.LocalSpark.sess() as spark:
    rdd = testutils.LocalSpark.archive_rdd(spark, fixture_path)
    name_data = rdd.map(lambda entry: (entry.name, entry.data)).collect()
    assert sorted(name_data) == sorted((s, s) for s in ss)

def test_archive_flyweight_zip():
  TEST_TEMPDIR = os.path.join(
    testconf.TEST_TEMPDIR_ROOT, 'test_archive_flyweight_zip')
  util.cleandir(TEST_TEMPDIR)

  # Create the fixture
  ss = ['foo', 'bar', 'baz']
  fixture_path = os.path.join(TEST_TEMPDIR, 'test.zip')

  import zipfile
  with zipfile.ZipFile(fixture_path, mode='w') as z:
    for s in ss:
      z.writestr(s, s)

  fws = util.ArchiveFileFlyweight.fws_from(fixture_path)
  assert len(fws) == len(ss)
  datas = [fw.data for fw in fws]
  assert sorted(datas) == sorted(ss)

def test_create_video():
  v = testutils.VideoFixture()

  VID_TEMPDIR = os.path.join(testconf.TEST_TEMPDIR_ROOT, 'test_create_video')
  util.cleandir(VID_TEMPDIR)
  path = os.path.join(VID_TEMPDIR, 'test_video.mov')
  with open(path, 'wb') as f:
    f.write(v.get_bytes())
  print "Wrote video to %s for inspection" % path

  import imageio
  reader = imageio.get_reader(path)
  meta = reader.get_meta_data()
  assert meta['fps'] == v.fps
  assert meta['nframes'] == v.n

  import itertools
  expected_imgs = itertools.cycle(testutils.iter_video_images(v.n, v.w, v.h))
  for im, expected in zip(reader, expected_imgs):
    assert im.shape == (v.w, v.h, 3)
    assert (im - expected).sum() == 0

def test_image_table(self):
  if not self.have_fixtures:
    return

  with testutils.LocalSpark.sess() as spark:
    TABLES = (
      TestMSCOCOImageTableTrain,
      TestMSCOCOImageTableVal,
    )
    for table in TABLES:
      util.cleandir(table.table_root())
      table.setup(spark=spark)

      util.run_cmd('du -sh %s' % table.table_root())

      rows = table.as_imagerow_rdd(spark).collect()
      uris = [mscoco.ImageURI.from_uri(r.uri) for r in rows]
      fnames = set(u.image_fname for u in uris)
      # Subtract one for the zip's folder entry
      assert len(fnames) == (TestFixtures.NUM_IMAGES_IN_TEST_ZIP - 1)
      assert set(table.EXPECTED_FNAMES) - fnames == set([])

      assert all(len(r.image_bytes) > 0 for r in rows)

def test_imagetable_demo(monkeypatch):
  TABLE_TEMPDIR = os.path.join(
    testconf.TEST_TEMPDIR_ROOT, 'ImageTable_pq_demo')
  util.cleandir(TABLE_TEMPDIR)

  with monkeypatch.context() as m:
    m.setattr(conf, 'AU_TABLE_CACHE', TABLE_TEMPDIR)
    ImageTable.setup()

    test_img_path = os.path.join(
      conf.AU_IMAGENET_SAMPLE_IMGS_DIR, '2929331372_398d58807e.jpg')
    rows = ImageTable.get_rows_by_uris((test_img_path, 'not_in_table'))
    assert len(rows) == 1
    row = rows[0]

    expected_bytes = open(test_img_path, 'rb').read()
    assert row.image_bytes == expected_bytes
    assert row.label == 'coffee'

    assert len(list(ImageTable.iter_all_rows())) == 6

def test_imagerow_demo(monkeypatch):
  ## We can create an empty row; all members are strings
  row = ImageRow()
  assert row.dataset == ''
  assert row.split == ''
  assert row.uri == ''
  assert row.image_bytes == ''

  ## Invariants for a row lacking image data:
  assert not row.as_numpy().any(), "Image has no bytes"
  assert row.to_debug() is None, "No image bytes to write"

  ## We can use kwargs to init any desired attribute
  row = ImageRow(dataset='test1')
  assert row.dataset == 'test1'
  assert row.image_bytes == ''

  ## ImageRows and dicts are interchangeable (see kwargs demo above)
  empty_row_as_dict = {
    'dataset': '',
    'split': '',
    'uri': '',
    'image_bytes': '',
    'label': '',
    'attrs': '',
  }
  assert ImageRow().to_dict() == empty_row_as_dict

  ## We can wrap a closure that generates an image
  def gen_img():
    return np.zeros((32, 32))
  row = ImageRow.wrap_factory(gen_img)
  assert len(row.image_bytes) == 75
  assert row.as_numpy().shape == (32, 32)

  ## We can instantiate from a file on disk
  row = ImageRow.from_path(testconf.MNIST_TEST_IMG_PATH, dataset='test2')
  assert row.dataset == 'test2'
  assert len(row.image_bytes) == 250
  assert row.as_numpy().shape == (28, 28)

  ## We can dump a row to disk for quick inspection
  with monkeypatch.context() as m:
    m.setattr(conf, 'AU_CACHE_TMP', testconf.TEST_TEMPDIR_ROOT)
    dest = row.to_debug(fname='ImageRowTest.png')
    assert os.path.exists(dest)
    expected = imageio.imread(testconf.MNIST_TEST_IMG_PATH)
    np.testing.assert_array_equal(row.as_numpy(), expected)

  ## The real value of ImageRow is that we can store datasets of images
  ## using parquet and manipulate them easily using Spark and Pandas
  rows = ImageRow.rows_from_images_dir(
    conf.AU_IMAGENET_SAMPLE_IMGS_DIR, dataset='d')
  rows = list(rows)
  assert len(rows) >= 6

  ## Rows can have labels of various types, too
  # rows[0].label = 'fake_label'
  # rows[1].label = 4  # fake label
  # rows[2].label = [1, 2]
  # rows[3].label = np.array([1, 2])
  # rows[4].label = {'key': 'value'}

  train = rows[:4]
  test = rows[4:]
  for r in train:
    r.split = 'train'
  for r in test:
    r.split = 'test'

  PQ_TEMPDIR = os.path.join(testconf.TEST_TEMPDIR_ROOT, 'ImageRow_pq_demo')
  util.cleandir(PQ_TEMPDIR)

  ImageRow.write_to_parquet(train, PQ_TEMPDIR)
  ImageRow.write_to_parquet(test, PQ_TEMPDIR)

  # pyarrow's parquet writer should have created some nice partitioned
  # directories
  for split in ('train', 'test'):
    d = os.path.join(PQ_TEMPDIR, 'dataset=d', 'split=%s' % split)
    assert os.path.exists(d)

  # Now try reading it back
  import pandas as pd
  import pyarrow as pa
  import pyarrow.parquet as pq
  pa_table = pq.read_table(PQ_TEMPDIR)
  df = pa_table.to_pandas()

  # Did we read back the correct images?
  assert set(df['uri']) == set(r.uri for r in rows)

  # Are the splits correct?
  expected_uri_to_split = {}
  expected_uri_to_split.update((r.uri, r.split) for r in train)
  expected_uri_to_split.update((r.uri, r.split) for r in test)

  df_rows = df.loc[:, ['uri', 'split']].to_dict(orient='records')
  actual_uri_to_split = dict((d['uri'], d['split']) for d in df_rows)
  assert actual_uri_to_split == expected_uri_to_split

  # Check the table contents; we should see the image bytes are identical
  # to the files
  for decoded_row in ImageRow.from_pandas(df):
    assert os.path.exists(decoded_row.uri)
    expected_bytes = open(decoded_row.uri, 'rb').read()
    assert decoded_row.image_bytes == expected_bytes

  ## We can also dump sets of rows as PNGs partitioned by dataset
  with monkeypatch.context() as m:
    m.setattr(conf, 'AU_DATA_CACHE', testconf.TEST_TEMPDIR_ROOT)
    ImageRow.write_to_pngs(rows)

    def expect_file(relpath, uri_to_expected):
      path = os.path.join(testconf.TEST_TEMPDIR_ROOT, relpath)
      assert os.path.exists(path)
      expected_bytes = open(uri_to_expected, 'rb').read()
      actual_bytes = open(path, 'rb').read()
      assert expected_bytes == actual_bytes

    expect_file(os.path.join('d/train', train[0].fname()), train[0].uri)
    expect_file(os.path.join('d/test', test[0].fname()), test[0].uri)

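# Sketch (added illustration, not part of the original demo): the partitioned
# parquet directory written by `ImageRow.write_to_parquet()` can also be read
# back through Spark, which is the other path the demo above alludes to for
# manipulating these tables at scale.  Assumes `testutils.LocalSpark` as used
# in the tests above; the helper name below is hypothetical.
def _demo_read_imagerow_parquet_with_spark(pq_dir):
  with testutils.LocalSpark.sess() as spark:
    # Spark's partition discovery recovers the dataset=/split= columns
    df = spark.read.parquet(pq_dir)
    df.printSchema()
    return df.count()
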
def _create_egg(cls, src_root=None, tmp_path=None):
  """Build a Python Egg from the current project and return a path to the
  artifact.

  Why an Egg?  `pyspark` supports zipfiles and egg files as Python artifacts.
  One might wish to use a wheel instead of an egg.  See this excellent
  article and repo:
   * https://bytes.grubhub.com/managing-dependencies-and-artifacts-in-pyspark-7641aa89ddb7
   * https://github.com/alekseyig/spark-submit-deps

  The drawbacks to using a wheel include:
   * Wheels often require native libraries to be installed (e.g. via
     `apt-get`), and those deps are typically best baked into the Spark
     Worker environment (versus installed on every job run).
   * The `BdistSpark` example from the repo above is rather slow, especially
     when Tensorflow is a dependency, and `BdistSpark` must run before every
     job is submitted.
   * Spark treats wheels as zip files and unzips them on every run; this
     unzip operation can be very expensive if the zipfile contains large
     binaries (e.g. tensorflow).

  In comparison, an Egg provides the main benefits we want (to ship project
  code, often pre-committed code, to workers).
  """

  log = util.create_log()

  if tmp_path is None:
    import tempfile
    tempdir = tempfile.gettempdir()

    SUBDIR_NAME = 'au_eggs'
    tmp_path = os.path.join(tempdir, SUBDIR_NAME)
    util.cleandir(tmp_path)

  if src_root is None:
    log.info("Trying to auto-resolve path to src root ...")
    try:
      import inspect
      path = inspect.getfile(inspect.currentframe())
      src_root = os.path.dirname(os.path.abspath(path))
    except Exception as e:
      log.info(
        "Failed to auto-resolve src root, "
        "falling back to %s" % cls.SRC_ROOT)
      src_root = cls.SRC_ROOT
    src_root = '/opt/au'

  log.info("Using source root %s " % src_root)

  # Below is a programmatic way to run something like:
  #   $ cd /opt/au && python setup.py clean bdist_egg
  # Based upon https://github.com/pypa/setuptools/blob/a94ccbf404a79d56f9b171024dee361de9a948da/setuptools/tests/test_bdist_egg.py#L30
  # See also:
  #  * https://github.com/pypa/setuptools/blob/f52b3b1c976e54df7a70db42bf59ca283412b461/setuptools/dist.py
  #  * https://github.com/pypa/setuptools/blob/46af765c49f548523b8212f6e08e1edb12f22ab6/setuptools/tests/test_sdist.py#L123
  #  * https://github.com/pypa/setuptools/blob/566f3aadfa112b8d6b9a1ecf5178552f6e0f8c6c/setuptools/__init__.py#L51
  from setuptools.dist import Distribution
  from setuptools import PackageFinder

  MODNAME = os.path.split(src_root)[-1]
  dist = Distribution(attrs=dict(
    script_name='setup.py',
    script_args=[
      'clean',
      'bdist_egg',
        '--dist-dir', tmp_path,
        '--bdist-dir', os.path.join(tmp_path, 'workdir'),
    ],
    name=MODNAME,
    src_root=src_root,
    packages=PackageFinder.find(where=src_root),
  ))
  log.info("Generating egg to %s ..." % tmp_path)
  with util.quiet():
    dist.parse_command_line()
    dist.run_commands()

  egg_path = os.path.join(tmp_path, MODNAME + '-0.0.0-py2.7.egg')
  assert os.path.exists(egg_path)
  log.info("... done.  Egg at %s" % egg_path)
  return egg_path

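# How the egg typically gets consumed (a hedged usage note, not code from
# this module): PySpark's `--py-files` / `SparkContext.addPyFile()` mechanism
# accepts .zip, .egg, and .py artifacts, so a caller would do something along
# the lines of:
#
#   egg_path = cls._create_egg()
#   spark.sparkContext.addPyFile(egg_path)
#
# The exact call site in this project is an assumption; only the
# `addPyFile()` API itself is standard PySpark.
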
def run_import(cls):
  import argparse
  import pprint

  parser = argparse.ArgumentParser(
    description=(
      "Help import and set up the bdd100k dataset. "
      "FMI see /docs/bdd100k.md"))
  parser.add_argument(
    '--src', default='/outer_root/tmp',
    help='Find zips in this dir [default %(default)s]')
  parser.add_argument(
    '--dest', default=cls.ROOT,
    help='Place files in this dir [default %(default)s]')
  parser.add_argument(
    '--num-videos', default=1000, type=int,
    help='Expand only this many video files [default %(default)s]')
  parser.add_argument(
    '--all-videos', default=False, action='store_true',
    help='Expand all videos (equivalent to --num-videos=-1)')
  parser.add_argument(
    '--skip-reindex', default=False, action='store_true',
    help='Skip extended setup / (re-)index phase')
  parser.add_argument(
    '--dry-run', default=False, action='store_true',
    help='Only show what would happen')
  args = parser.parse_args()

  EXPECTED_FILE_TO_DEST = {
    'bdd100k_info.zip': cls.telemetry_zip(),
    'bdd100k_videos.zip': cls.video_zip(),
    # 'bdd100k_drivable_maps.zip': ?,
    # 'bdd100k_images.zip': ?,
    # 'bdd100k_labels_release.zip': ?,
    # 'bdd100k_seg.zip': ?,
  }

  src_paths = list(util.all_files_recursive(args.src))
  found = (
    set(EXPECTED_FILE_TO_DEST.keys()) &
    set(os.path.basename(p) for p in src_paths))
  if found:
    print "Found the following files, which we will import:"
    pprint.pprint(list(found))

  spark = Spark.getOrCreate()

  ### Emplace Data

  def get_path(fname, paths):
    for p in paths:
      if fname in p:
        return p

  def run_safe(cmd):
    if args.dry_run:
      print "DRY RUN SKIPPED: " + cmd
    else:
      util.run_cmd(cmd)

  for fname, dest in sorted(EXPECTED_FILE_TO_DEST.iteritems()):
    src_path = get_path(fname, src_paths)
    if not src_path:
      continue

    if fname == 'bdd100k_videos.zip':
      cmd = 'ln -s %s %s' % (src_path, dest)
      run_safe(cmd)

      archive_rdd = Spark.archive_rdd(spark, cls.FIXTURES.video_zip())
      archive_rdd = archive_rdd.filter(lambda fw: 'mov' in fw.name)
      n_vids = archive_rdd.count()
      print "Found %s videos in %s ..." % (n_vids, src_path)

      if args.all_videos:
        max_videos = n_vids
      else:
        max_videos = min(n_vids, args.num_videos)

      vids = sorted(archive_rdd.map(lambda fw: fw.name).collect())
      vids = set(vids[:max_videos])

      vids_to_import = archive_rdd.filter(lambda fw: fw.name in vids)
      print "... importing %s videos ..." % vids_to_import.count()

      dry_run = args.dry_run
      dest_dir = cls.video_dir()
      def copy_vid(fw):
        vid_dest = os.path.join(dest_dir, fw.name)
        util.mkdir(os.path.dirname(vid_dest))
        if dry_run:
          print "DRY RUN SKIPPED: " + fw.name
        else:
          with open(vid_dest, 'wb') as f:
            f.write(fw.data)

      vids_to_import.foreach(copy_vid)
      print "... import complete! Imported to %s ." % dest_dir

    else:
      cmd = 'cp -v %s %s' % (src_path, dest)
      run_safe(cmd)

  ### Index Data

  if args.skip_reindex:
    print "Skipping (re-)index phase"
    return

  if args.dry_run:
    print "DRY RUN SKIPPED index & setup phase"
  else:
    print "Cleaning index and debug dirs ..."
    util.cleandir(cls.video_index_root())
    util.cleandir(cls.video_debug_dir())

    print "Running video setup ..."
    VideoDataset.setup(spark, all_videos=args.all_videos)