Example #1
    def create_test_fixtures(cls):
        log = util.create_log()

        log.info("Creating bdd100k test fixtures ...")
        ZIPS_TO_COPY = (cls.telemetry_zip(), )

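        # Copy just the first 10 entries of each zip into the test fixture dir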
        util.cleandir(cls.TEST_FIXTURE_DIR)
        for path in ZIPS_TO_COPY:
            util.copy_n_from_zip(path, cls.test_fixture(path), 10)

        # Videos: just copy the ones that have INFO data
        log.info("Copying videos ...")
        fws = util.ArchiveFileFlyweight.fws_from(
            cls.test_fixture(cls.telemetry_zip()))
        for fw in fws:
            if 'json' not in fw.name:
                continue

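            # Derive the video filename from the INFO JSON name and copy that
            # video into the fixture tree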
            relpath = InfoDataset.json_fname_to_video_fname(fw.name)
            relpath = relpath[len('bdd100k/info/'):]
            path = os.path.join(cls.video_dir(), relpath)
            dest = cls.test_fixture(path)
            util.mkdir(os.path.dirname(dest))
            util.run_cmd('cp -v ' + path + ' ' + dest)
        log.info("... done copying videos.")

        # For testing, create a video that has no INFO
        dest = cls.test_fixture(
            os.path.join(cls.video_dir(), '100k', 'train',
                         'video_with_no_info.mov'))
        codec = 'h264'  # Chrome will not play `png` movies
        video_bytes = testutils.VideoFixture(codec=codec).get_bytes()
        with open(dest, 'wb') as f:
            f.write(video_bytes)
        log.info("Wrote synth video to %s ..." % dest)
Example #2
def test_spark_numpy_df():
    TEST_TEMPDIR = os.path.join(testconf.TEST_TEMPDIR_ROOT, 'spark_numpy_df')
    util.cleandir(TEST_TEMPDIR)

    import numpy as np
    rows = [
        {
            'id': 1,
            'a': np.array([1]),
            'b': np.array([[1]]),
            'c': np.array([[[1]], [[2]], [[3]]]),
        },
        {
            'id': 2,
            'a': np.array([]),
            'b': None,
            'c': None,
        },
    ]

    # Test serialization numpy <-> parquet
    with testutils.LocalSpark.sess() as spark:
        from pyspark.sql import Row

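        # Wrap every value in NumpyArray so Spark can serialize the numpy
        # arrays to Parquet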
        wrapped_rows = [
            Row(**dict((k, NumpyArray(v)) for k, v in row.iteritems()))
            for row in rows
        ]

        df = spark.createDataFrame(wrapped_rows)
        df.show()
        outpath = os.path.join(TEST_TEMPDIR, 'rowdata')
        df.write.parquet(outpath)

        df2 = spark.read.parquet(outpath)
        decoded_wrapped_rows = df2.collect()

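        # Unwrap each NumpyArray back to its underlying value; nulls stay None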
        decoded_rows = [
            dict((k, v.arr if v else v) for k, v in row.asDict().iteritems())
            for row in decoded_wrapped_rows
        ]

        # We can't do assert sorted(rows) == sorted(decoded_rows)
        # because numpy syntactic sugar breaks ==
        import pprint

        def sorted_row_str(rowz):
            return pprint.pformat(sorted(rowz, key=lambda row: row['id']))

        assert sorted_row_str(rows) == sorted_row_str(decoded_rows)
Example #3
def test_spark_archive_zip():
    TEST_TEMPDIR = os.path.join(testconf.TEST_TEMPDIR_ROOT,
                                'test_spark_archive_zip')
    util.cleandir(TEST_TEMPDIR)

    # Create the fixture
    ss = ['foo', 'bar', 'baz']

    fixture_path = os.path.join(TEST_TEMPDIR, 'test.zip')

    import zipfile
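    # Each zip entry's name doubles as its contents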
    with zipfile.ZipFile(fixture_path, mode='w') as z:
        for s in ss:
            z.writestr(s, s)

    with testutils.LocalSpark.sess() as spark:
        rdd = testutils.LocalSpark.archive_rdd(spark, fixture_path)
        name_data = rdd.map(lambda entry: (entry.name, entry.data)).collect()
        assert sorted(name_data) == sorted((s, s) for s in ss)
Example #4
def test_archive_flyweight_zip():
    TEST_TEMPDIR = os.path.join(testconf.TEST_TEMPDIR_ROOT,
                                'test_archive_flyweight_zip')
    util.cleandir(TEST_TEMPDIR)

    # Create the fixture
    ss = ['foo', 'bar', 'baz']

    fixture_path = os.path.join(TEST_TEMPDIR, 'test.zip')

    import zipfile
    with zipfile.ZipFile(fixture_path, mode='w') as z:
        for s in ss:
            z.writestr(s, s)

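    # Expect one flyweight per zip entry, with data matching the entry contents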
    fws = util.ArchiveFileFlyweight.fws_from(fixture_path)
    assert len(fws) == len(ss)
    datas = [fw.data for fw in fws]
    assert sorted(datas) == sorted(ss)
Example #5
def test_create_video():
    v = testutils.VideoFixture()

    VID_TEMPDIR = os.path.join(testconf.TEST_TEMPDIR_ROOT, 'test_create_video')
    util.cleandir(VID_TEMPDIR)
    path = os.path.join(VID_TEMPDIR, 'test_video.mov')
    with open(path, 'wb') as f:
        f.write(v.get_bytes())
        print "Wrote video to %s for inspection" % path

    import imageio
    reader = imageio.get_reader(path)
    meta = reader.get_meta_data()
    assert meta['fps'] == v.fps
    assert meta['nframes'] == v.n

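    # Compare each decoded frame against the synthetic source frames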
    import itertools
    expected_imgs = itertools.cycle(testutils.iter_video_images(v.n, v.w, v.h))
    for im, expected in zip(reader, expected_imgs):
        assert im.shape == (v.w, v.h, 3)
        assert (im - expected).sum() == 0
Example #6
    def test_image_table(self):
        if not self.have_fixtures:
            return

        with testutils.LocalSpark.sess() as spark:
            TABLES = (
                TestMSCOCOImageTableTrain,
                TestMSCOCOImageTableVal,
            )

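            # Build each test table from a clean root and sanity-check its rows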
            for table in TABLES:
                util.cleandir(table.table_root())
                table.setup(spark=spark)
                util.run_cmd('du -sh %s' % table.table_root())

                rows = table.as_imagerow_rdd(spark).collect()
                uris = [mscoco.ImageURI.from_uri(r.uri) for r in rows]
                fnames = set(u.image_fname for u in uris)
                # Subtract one for the zip's folder entry
                assert len(fnames) == (TestFixtures.NUM_IMAGES_IN_TEST_ZIP - 1)
                assert set(table.EXPECTED_FNAMES) - fnames == set([])
                assert all(len(r.image_bytes) > 0 for r in rows)
Example #7
def test_imagetable_demo(monkeypatch):

    TABLE_TEMPDIR = os.path.join(testconf.TEST_TEMPDIR_ROOT,
                                 'ImageTable_pq_demo')
    util.cleandir(TABLE_TEMPDIR)

    with monkeypatch.context() as m:
        m.setattr(conf, 'AU_TABLE_CACHE', TABLE_TEMPDIR)

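        # AU_TABLE_CACHE is patched above, so setup() should write the table
        # under TABLE_TEMPDIR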
        ImageTable.setup()

        test_img_path = os.path.join(conf.AU_IMAGENET_SAMPLE_IMGS_DIR,
                                     '2929331372_398d58807e.jpg')
        rows = ImageTable.get_rows_by_uris((test_img_path, 'not_in_table'))
        assert len(rows) == 1
        row = rows[0]

        expected_bytes = open(test_img_path, 'rb').read()
        assert row.image_bytes == expected_bytes
        assert row.label == 'coffee'

        assert len(list(ImageTable.iter_all_rows())) == 6
Example #8
def test_imagerow_demo(monkeypatch):

    ## We can create an empty row; all members are strings
    row = ImageRow()
    assert row.dataset == ''
    assert row.split == ''
    assert row.uri == ''
    assert row.image_bytes == ''

    ## Invariants for a row lacking image data:
    assert not row.as_numpy().any(), "Image has no bytes"
    assert row.to_debug() is None, "No image bytes to write"

    ## We can use kwargs to init any desired attribute
    row = ImageRow(dataset='test1')
    assert row.dataset == 'test1'
    assert row.image_bytes == ''

    ## ImageRows and dicts are interchangeable (see kwargs demo above)
    empty_row_as_dict = {
        'dataset': '',
        'split': '',
        'uri': '',
        'image_bytes': '',
        'label': '',
        'attrs': '',
    }
    assert ImageRow().to_dict() == empty_row_as_dict

    ## We can wrap a closure that generates an image
    def gen_img():
        return np.zeros((32, 32))

    row = ImageRow.wrap_factory(gen_img)
    assert len(row.image_bytes) == 75
    assert row.as_numpy().shape == (32, 32)

    ## We can instantiate from a file on disk
    row = ImageRow.from_path(testconf.MNIST_TEST_IMG_PATH, dataset='test2')
    assert row.dataset == 'test2'
    assert len(row.image_bytes) == 250
    assert row.as_numpy().shape == (28, 28)

    ## We can dump a row to disk for quick inspection
    with monkeypatch.context() as m:
        m.setattr(conf, 'AU_CACHE_TMP', testconf.TEST_TEMPDIR_ROOT)
        dest = row.to_debug(fname='ImageRowTest.png')
        assert os.path.exists(dest)
        expected = imageio.imread(testconf.MNIST_TEST_IMG_PATH)
        np.testing.assert_array_equal(row.as_numpy(), expected)

    ## The real warrant for ImageRow is so that we can store datasets of
    ## images using parquet and manipulate them easily using Spark and Pandas
    rows = ImageRow.rows_from_images_dir(conf.AU_IMAGENET_SAMPLE_IMGS_DIR,
                                         dataset='d')
    rows = list(rows)
    assert len(rows) >= 6

    # Rows can have labels of various types, too:
    #   rows[0].label = 'fake_label'
    #   rows[1].label = 4 # fake label
    #   rows[2].label = [1, 2]
    #   rows[3].label = np.array([1, 2])
    #   rows[4].label = {'key': 'value'}

    train = rows[:4]
    test = rows[4:]

    for r in train:
        r.split = 'train'
    for r in test:
        r.split = 'test'

    PQ_TEMPDIR = os.path.join(testconf.TEST_TEMPDIR_ROOT, 'ImageRow_pq_demo')
    util.cleandir(PQ_TEMPDIR)

    ImageRow.write_to_parquet(train, PQ_TEMPDIR)
    ImageRow.write_to_parquet(test, PQ_TEMPDIR)

    # pyarrow's parquet writer should have created some nice partitioned
    # directories
    for split in ('train', 'test'):
        d = os.path.join(PQ_TEMPDIR, 'dataset=d', 'split=%s' % split)
        assert os.path.exists(d)

    # Now try reading it back
    import pandas as pd
    import pyarrow as pa
    import pyarrow.parquet as pq

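    # read_table() loads the whole partitioned dataset; the dataset=/split=
    # directory names come back as columns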
    pa_table = pq.read_table(PQ_TEMPDIR)
    df = pa_table.to_pandas()

    # Did we read back the correct images?
    assert set(df['uri']) == set(r.uri for r in rows)

    # Are the splits correct?
    expected_uri_to_split = {}
    expected_uri_to_split.update((r.uri, r.split) for r in train)
    expected_uri_to_split.update((r.uri, r.split) for r in test)
    df_rows = df.loc[:, ['uri', 'split']].to_dict(orient='records')
    actual_uri_to_split = dict((d['uri'], d['split']) for d in df_rows)
    assert actual_uri_to_split == expected_uri_to_split

    # Check the table contents; we should see the image bytes are identical
    # to the files
    for decoded_row in ImageRow.from_pandas(df):
        assert os.path.exists(decoded_row.uri)
        expected_bytes = open(decoded_row.uri, 'rb').read()
        assert decoded_row.image_bytes == expected_bytes

    ## We can also dump sets of rows as PNGs partitioned by dataset
    with monkeypatch.context() as m:
        m.setattr(conf, 'AU_DATA_CACHE', testconf.TEST_TEMPDIR_ROOT)
        ImageRow.write_to_pngs(rows)

        def expect_file(relpath, uri_to_expected):
            path = os.path.join(testconf.TEST_TEMPDIR_ROOT, relpath)
            assert os.path.exists(path)
            expected_bytes = open(uri_to_expected, 'rb').read()
            actual_bytes = open(path, 'rb').read()
            assert expected_bytes == actual_bytes

        expect_file(os.path.join('d/train', train[0].fname()), train[0].uri)
        expect_file(os.path.join('d/test', test[0].fname()), test[0].uri)
Example #9
  def _create_egg(cls, src_root=None, tmp_path=None):
    """Build a Python Egg from the current project and return a path
    to the artifact.  

    Why an Egg?  `pyspark` supports zipfiles and egg files as Python artifacts.
    One might wish to use a wheel instead of an egg.  See this excellent
    article and repo:
     * https://bytes.grubhub.com/managing-dependencies-and-artifacts-in-pyspark-7641aa89ddb7
     * https://github.com/alekseyig/spark-submit-deps
    
    The drawbacks to using a wheel include:
     * wheels often require native libraries to be installed (e.g. via
        `apt-get`), and those deps are typically best baked into the Spark
        Worker environment (versus installed on every job run).
     * The `BdistSpark` example above is actually rather slow, especially
        when Tensorflow is a dependency, and `BdistSpark` must run before
        every job is submitted.
     * Spark treats wheels as zip files and unzips them on every run; this
        unzip operation can be very expensive if the zipfile contains large
        binaries (e.g. tensorflow)
    
    In comparison, an Egg provides the main benefit we want: shipping project
    code (often not-yet-committed code) to workers.
    """

    log = util.create_log()

    if tmp_path is None:
      import tempfile
      tempdir = tempfile.gettempdir()

      SUBDIR_NAME = 'au_eggs'
      tmp_path = os.path.join(tempdir, SUBDIR_NAME)
      util.cleandir(tmp_path)

    if src_root is None:
      log.info("Trying to auto-resolve path to src root ...")
      try:
        import inspect
        path = inspect.getfile(inspect.currentframe())
        src_root = os.path.dirname(os.path.abspath(path))
      except Exception as e:
        log.info(
          "Failed to auto-resolve src root, "
          "falling back to %s" % cls.SRC_ROOT)
        src_root = cls.SRC_ROOT
    
    src_root = '/opt/au'
    log.info("Using source root %s " % src_root)

    # Below is a programmatic way to run something like:
    # $ cd /opt/au && python setup.py clean bdist_egg
    # Based upon https://github.com/pypa/setuptools/blob/a94ccbf404a79d56f9b171024dee361de9a948da/setuptools/tests/test_bdist_egg.py#L30
    # See also: 
    # * https://github.com/pypa/setuptools/blob/f52b3b1c976e54df7a70db42bf59ca283412b461/setuptools/dist.py
    # * https://github.com/pypa/setuptools/blob/46af765c49f548523b8212f6e08e1edb12f22ab6/setuptools/tests/test_sdist.py#L123
    # * https://github.com/pypa/setuptools/blob/566f3aadfa112b8d6b9a1ecf5178552f6e0f8c6c/setuptools/__init__.py#L51
    from setuptools.dist import Distribution
    from setuptools import PackageFinder
    MODNAME = os.path.split(src_root)[-1]
    dist = Distribution(attrs=dict(
        script_name='setup.py',
        script_args=[
          'clean',
          'bdist_egg', 
            '--dist-dir', tmp_path,
            '--bdist-dir', os.path.join(tmp_path, 'workdir'),
        ],
        name=MODNAME,
        src_root=src_root,
        packages=PackageFinder.find(where=src_root),
    ))
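    # parse_command_line() consumes script_args above; run_commands() then
    # runs `clean` and `bdist_egg`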
    log.info("Generating egg to %s ..." % tmp_path)
    with util.quiet():
      dist.parse_command_line()
      dist.run_commands()

    egg_path = os.path.join(tmp_path, MODNAME + '-0.0.0-py2.7.egg')
    assert os.path.exists(egg_path)
    log.info("... done.  Egg at %s" % egg_path)
    return egg_path
Example #10
    def run_import(cls):
        import argparse
        import pprint

        parser = argparse.ArgumentParser(
            description=("Help import and set up the bdd100k dataset. "
                         "FMI see /docs/bdd100k.md"))
        parser.add_argument('--src',
                            default='/outer_root/tmp',
                            help='Find zips in this dir [default %(default)s]')
        parser.add_argument(
            '--dest',
            default=cls.ROOT,
            help='Place files in this dir [default %(default)s]')
        parser.add_argument(
            '--num-videos',
            type=int,
            default=1000,
            help='Expand only this many video files [default %(default)s]')
        parser.add_argument(
            '--all-videos',
            default=False,
            action='store_true',
            help='Expand all videos (equivalent to --num-videos=-1)')
        parser.add_argument('--skip-reindex',
                            default=False,
                            action='store_true',
                            help='Skip extended setup / (re-)index phase')
        parser.add_argument('--dry-run',
                            default=False,
                            action='store_true',
                            help='Only show what would happen')

        args = parser.parse_args()

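        # Only the telemetry and video zips are handled for now; the other
        # bdd100k zips are listed but not imported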
        EXPECTED_FILE_TO_DEST = {
            'bdd100k_info.zip': cls.telemetry_zip(),
            'bdd100k_videos.zip': cls.video_zip(),
            # 'bdd100k_drivable_maps.zip': ?,
            # 'bdd100k_images.zip': ?,
            # 'bdd100k_labels_release.zip': ?,
            # 'bdd100k_seg.zip': ?,
        }

        src_paths = list(util.all_files_recursive(args.src))
        found = (set(EXPECTED_FILE_TO_DEST.keys())
                 & set(os.path.basename(p) for p in src_paths))
        if found:
            print "Found the following files, which we will import:"
            pprint.pprint(list(found))

        spark = Spark.getOrCreate()

        ### Emplace Data

        def get_path(fname, paths):
            for p in paths:
                if fname in p:
                    return p

        def run_safe(cmd):
            if args.dry_run:
                print "DRY RUN SKIPPED: " + cmd
            else:
                util.run_cmd(cmd)

        for fname, dest in sorted(EXPECTED_FILE_TO_DEST.iteritems()):
            src_path = get_path(fname, src_paths)
            if not src_path:
                continue

            if fname == 'bdd100k_videos.zip':
                cmd = 'ln -s %s %s' % (src_path, dest)
                run_safe(cmd)

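                # Read the (symlinked) video zip as an RDD of archive entries
                # and keep only the .mov files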
                archive_rdd = Spark.archive_rdd(spark,
                                                cls.FIXTURES.video_zip())
                archive_rdd = archive_rdd.filter(lambda fw: 'mov' in fw.name)
                n_vids = archive_rdd.count()
                print "Found %s videos in %s ..." % (n_vids, src_path)

                if args.all_videos:
                    max_videos = n_vids
                else:
                    max_videos = min(n_vids, args.num_videos)

                vids = sorted(archive_rdd.map(lambda fw: fw.name).collect())
                vids = set(vids[:max_videos])
                vids_to_import = archive_rdd.filter(lambda fw: fw.name in vids)
                print "... importing %s videos ..." % len(
                    vids_to_import.count())

                dry_run = args.dry_run
                dest_dir = cls.video_dir()

                def copy_vid(fw):
                    vid_dest = os.path.join(dest_dir, fw.name)
                    util.mkdir(os.path.dirname(vid_dest))
                    if dry_run:
                        print "DRY RUN SKIPPED: " + f.name
                    else:
                        with open(vid_dest, 'wb') as f:
                            f.write(fw.data)

                vids_to_import.foreach(copy_vid)

                print "... import complete! Imported to %s ." % dest_dir

            else:
                cmd = 'cp -v %s %s' % (src_path, dest)
                run_safe(cmd)

        ### Index Data
        if args.skip_reindex:
            print "Skipping (re-)index phase"
            return

        if args.dry_run:
            print "DRY RUN SKIPPED index & setup phase"
        else:
            print "Cleaning index and debug dirs ..."
            util.cleandir(cls.video_index_root())
            util.cleandir(cls.video_debug_dir())

            print "Running video setup ..."
            VideoDataset.setup(spark, all_videos=args.all_videos)