Example #1
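These excerpts omit their import block; a likely set for these TensorFlow Transform tests (an assumption, not shown in the original) is:

import os

import apache_beam as beam
import numpy as np
import tensorflow as tf
from apache_beam.testing import util as beam_test_util

from tensorflow_transform import analyzer_nodes
from tensorflow_transform import analyzers
from tensorflow_transform import test_case
from tensorflow_transform.beam import analyzer_cache
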
class AnalyzerCacheTest(test_case.TransformTestCase):
    def test_validate_dataset_keys(self):
        analyzer_cache.validate_dataset_keys(
            {'foo', 'Foo', 'A1', 'A_1', 'A.1', 'A-1'})

        for key in {'foo 1', 'foo@1', 'foo*', 'foo[]', 'foo/goo'}:
            with self.assertRaisesRegexp(
                    ValueError,
                    'Dataset key .* does not match allowed pattern:'):
                analyzer_cache.validate_dataset_keys({key})

    @test_case.named_parameters(
        dict(testcase_name='JsonNumpyCacheCoder',
             coder=analyzer_nodes.JsonNumpyCacheCoder(),
             value=[1, 2.5, 3, '4']),
        dict(testcase_name='JsonNumpyCacheCoderNpArray',
             coder=analyzer_nodes.JsonNumpyCacheCoder(),
             value=np.array([1, 2.5, 3, '4'])),
        dict(testcase_name='JsonNumpyCacheCoderNestedNpTypes',
             coder=analyzer_nodes.JsonNumpyCacheCoder(),
             value=[np.int64(1), np.float32(2.5), 3, '4']),
        dict(testcase_name='_VocabularyAccumulatorCoderIntAccumulator',
             coder=analyzer_nodes._VocabularyAccumulatorCoder(),
             value=[b'A', 17]),
        dict(testcase_name='_VocabularyAccumulatorCoderIntAccumulatorNonUtf8',
             coder=analyzer_nodes._VocabularyAccumulatorCoder(),
             value=[b'\x8a', 29]),
        dict(testcase_name='_VocabularyAccumulatorCoderClassAccumulator',
             coder=analyzer_nodes._VocabularyAccumulatorCoder(),
             value=[
                 b'A',
                 analyzers._WeightedMeanAndVarAccumulator(
                     count=np.array(5),
                     mean=np.array([.4, .9, 1.5]),
                     variance=np.array([.1, .4, .5]),
                     weight=np.array(0.),
                 )
             ]),
        dict(
            testcase_name='_QuantilesAccumulatorCoderClassAccumulator',
            coder=analyzers._QuantilesAccumulatorCacheCoder(),
            value=[
                '\n\x0f\r\x00\x00 A\x15\x00\x00\x80?%\x00\x00\x80?\n\x14\r\x00\x00@A\x15\x00\x00\x80?\x1d\x00\x00\x80?%\x00\x00\x00@',
                '',
                _get_quantiles_summary()
            ]),
        dict(testcase_name='_CombinerPerKeyAccumulatorCoder',
             coder=analyzer_nodes._CombinerPerKeyAccumulatorCoder(
                 analyzer_nodes.JsonNumpyCacheCoder()),
             value=[b'\x8a', [np.int64(1),
                              np.float32(2.5), 3, '4']]),
    )
    def test_coders_round_trip(self, coder, value):
        encoded = coder.encode_cache(value)
        np.testing.assert_equal(coder.decode_cache(encoded), value)

    def test_cache_helpers_round_trip(self):
        base_test_dir = os.path.join(
            os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
            self._testMethodName)

        dataset_key_0 = 'dataset_key_0'
        dataset_key_1 = 'dataset_key_1'
        dataset_keys = (dataset_key_0, dataset_key_1)

        with beam.Pipeline() as p:
            cache_pcoll_dict = {
                dataset_key_0: {
                    b'\x8a': p | 'CreateA' >> beam.Create([b'[1, 2, 3]']),
                    b'\x8b': p | 'CreateB' >> beam.Create([b'[5]']),
                },
                dataset_key_1: {
                    b'\x8c': p | 'CreateC' >> beam.Create([b'[9, 5, 2, 1]']),
                },
            }
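            # cache_pcoll_dict maps dataset key -> cache entry key -> a
            # PCollection holding that entry's encoded cache payload.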

            _ = cache_pcoll_dict | analyzer_cache.WriteAnalysisCacheToFS(
                p, base_test_dir, dataset_keys)

        with beam.Pipeline() as p:
            read_cache = p | analyzer_cache.ReadAnalysisCacheFromFS(
                base_test_dir, list(cache_pcoll_dict.keys()))

            beam_test_util.assert_that(read_cache['dataset_key_0'][b'\x8a'],
                                       beam_test_util.equal_to([b'[1, 2, 3]']),
                                       label='AssertA')
            beam_test_util.assert_that(read_cache['dataset_key_0'][b'\x8b'],
                                       beam_test_util.equal_to([b'[5]']),
                                       label='AssertB')
            beam_test_util.assert_that(read_cache['dataset_key_1'][b'\x8c'],
                                       beam_test_util.equal_to(
                                           [b'[9, 5, 2, 1]']),
                                       label='AssertC')

    def test_cache_merge(self):
        base_test_dir = os.path.join(
            os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
            self._testMethodName)

        dataset_key_0 = 'dataset_key_0'
        dataset_key_1 = 'dataset_key_1'
        dataset_keys = (dataset_key_0, dataset_key_1)
        cache_keys = list('abcd')

        def read_manifests():
            return [
                analyzer_cache._ManifestFile(
                    analyzer_cache._get_dataset_cache_path(base_test_dir,
                                                           key)).read()
                for key in dataset_keys
            ]

        with beam.Pipeline() as p:
            cache_pcoll_dict = {
                dataset_key_0: {
                    'a': p | 'CreateA' >> beam.Create([b'a']),
                    'b': p | 'CreateB' >> beam.Create([b'b']),
                },
                dataset_key_1: {
                    'c': p | 'CreateC' >> beam.Create([b'c']),
                    'd': p | 'CreateD' >> beam.Create([b'd']),
                },
            }
            _ = cache_pcoll_dict | analyzer_cache.WriteAnalysisCacheToFS(
                p, base_test_dir, dataset_keys)

        first_manifests = read_manifests()

        with beam.Pipeline() as p:
            cache_pcoll_dict = {
                dataset_key_0: {
                    'c': p | 'CreateC' >> beam.Create([b'c']),
                    'd': p | 'CreateD' >> beam.Create([b'd']),
                },
                dataset_key_1: {
                    'a': p | 'CreateA' >> beam.Create([b'a']),
                    'b': p | 'CreateB' >> beam.Create([b'b']),
                },
            }
            _ = cache_pcoll_dict | analyzer_cache.WriteAnalysisCacheToFS(
                p, base_test_dir, dataset_keys)

        second_manifests = read_manifests()
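        # The second write should merge into the existing cache manifests:
        # every entry from the first write is preserved and the new cache
        # keys are appended, so each merged manifest lists all four keys.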
        self.assertEqual(len(first_manifests), len(second_manifests))
        for manifest_a, manifest_b in zip(first_manifests, second_manifests):
            for key_value_pair in manifest_a.items():
                self.assertIn(key_value_pair, manifest_b.items())

            self.assertEqual(2, len(manifest_a))
            self.assertCountEqual(range(len(manifest_a)), manifest_a.values())

            self.assertEqual(4, len(manifest_b))
            self.assertCountEqual(range(len(manifest_b)), manifest_b.values())
            self.assertCountEqual(cache_keys, manifest_b.keys())

    def test_cache_helpers_with_alternative_io(self):
        class LocalSink(beam.PTransform):
            def __init__(self, path):
                self._path = path

            def expand(self, pcoll):
                def write_to_file(value):
                    tf.io.gfile.makedirs(self._path)
                    with open(os.path.join(self._path, 'cache'), 'wb') as f:
                        f.write(value)

                return pcoll | beam.Map(write_to_file)

        test_cache_dict = {'a': {'b': [bytes([17, 19, 27, 31])]}}

        class LocalSource(beam.PTransform):
            def __init__(self, path):
                del path

            def expand(self, pbegin):
                return pbegin | beam.Create([test_cache_dict['a']['b']])

        dataset_keys = list(test_cache_dict.keys())
        cache_dir = self.get_temp_dir()
        with beam.Pipeline() as p:
            _ = test_cache_dict | analyzer_cache.WriteAnalysisCacheToFS(
                p, cache_dir, dataset_keys, sink=LocalSink)

            read_cache = p | analyzer_cache.ReadAnalysisCacheFromFS(
                cache_dir, dataset_keys, source=LocalSource)

            self.assertItemsEqual(read_cache.keys(), ['a'])
            self.assertItemsEqual(read_cache['a'].keys(), ['b'])

            beam_test_util.assert_that(
                read_cache['a']['b'],
                beam_test_util.equal_to([test_cache_dict['a']['b']]))
Example #2
_MEAN_AND_VAR_SIMPLE_TEST = dict(
    testcase_name='WeightedMeanAndVarSimple',
    combiner=analyzers.WeightedMeanAndVarCombiner(np.float32,
                                                  output_shape=(),
                                                  compute_variance=False,
                                                  compute_weighted=False),
    batches=[
        _make_mean_and_var_accumulator_from_instance([[1, 2, 3, 4, 5, 6, 7]]),
        # Count is 5*0xFFFF=327675 for this accumulator.
        _make_mean_and_var_accumulator_from_instance([[8, 9, 10, 11, 12]] *
                                                     0xFFFF),
        _make_mean_and_var_accumulator_from_instance([[100, 200, 3000]]),
    ],
    expected_outputs=analyzers._WeightedMeanAndVarAccumulator(
        count=np.array(327685),
        mean=np.float32(10.00985092390558),
        weight=np.float32(1.0),
        variance=np.float32(0.0)))
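
# Sanity check (illustrative, not part of the original): pooling the three
# accumulators above gives
#   total count = 7 + 5 * 0xFFFF + 3 = 327685
#   pooled mean = (7 * 4.0 + 327675 * 10.0 + 3 * 1100.0) / 327685
#               = 3280078.0 / 327685 ~= 10.00985092
# which matches the expected count and mean in _MEAN_AND_VAR_SIMPLE_TEST.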

_MEAN_AND_VAR_BIG_TEST = dict(
    testcase_name='WeightedMeanAndVarBig',
    combiner=analyzers.WeightedMeanAndVarCombiner(np.float32, output_shape=()),
    batches=[
        _make_mean_and_var_accumulator_from_instance([[1, 2, 3, 4, 5, 6, 7]]),
        _make_mean_and_var_accumulator_from_instance([[1e15, 2e15, 3000]]),
        _make_mean_and_var_accumulator_from_instance([[100, 200]]),
    ],
    expected_outputs=[
        np.float32(2.50e+14),
        np.float32(3.541666666665e+29),
    ])
Example #3
def _make_mean_and_var_accumulator_from_instance(instance, axis=None):
    return analyzers._WeightedMeanAndVarAccumulator(
        count=np.sum(np.ones_like(instance), axis=axis),
        mean=np.mean(instance, axis=axis),
        weight=np.sum(np.ones_like(instance), axis=axis),
        variance=np.var(instance, axis=axis))
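
# Illustrative note (not in the original): with the default axis=None the helper
# reduces over every element of the batch, e.g.
# _make_mean_and_var_accumulator_from_instance([[1, 2, 3, 4, 5, 6, 7]]) yields
# count=7, mean=4.0, weight=7 and variance=4.0.
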
class AnalyzerCacheTest(test_case.TransformTestCase):
    def test_validate_dataset_keys(self):
        analyzer_cache.validate_dataset_keys(
            {'foo', 'Foo', 'A1', 'A_1', 'A.1', 'A-1'})

        for key in {'foo 1', 'foo@1', 'foo*', 'foo[]', 'foo/goo'}:
            with self.assertRaisesRegexp(
                    ValueError,
                    'Dataset key .* does not match allowed pattern:'):
                analyzer_cache.validate_dataset_keys({key})

    @test_case.named_parameters(
        dict(testcase_name='JsonNumpyCacheCoder',
             coder_cls=analyzer_nodes.JsonNumpyCacheCoder,
             value=[1, 2.5, 3, '4']),
        dict(testcase_name='_VocabularyAccumulatorCoderIntAccumulator',
             coder_cls=analyzer_nodes._VocabularyAccumulatorCoder,
             value=['A', 17]),
        dict(testcase_name='_VocabularyAccumulatorCoderClassAccumulator',
             coder_cls=analyzer_nodes._VocabularyAccumulatorCoder,
             value=[
                 'A',
                 analyzers._WeightedMeanAndVarAccumulator(
                     count=np.array(5),
                     mean=np.array([.4, .9, 1.5]),
                     variance=np.array([.1, .4, .5]),
                     weight=np.array(0.),
                 )
             ]),
    )
    def test_coders_round_trip(self, coder_cls, value):
        coder = coder_cls()
        encoded = coder.encode_cache(value)
        np.testing.assert_equal(value, coder.decode_cache(encoded))

    def test_cache_helpers_round_trip(self):
        base_test_dir = os.path.join(
            os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
            self._testMethodName)

        with beam.Pipeline() as p:
            cache_pcoll_dict = {
                'dataset_key_0': {
                    'a': p | 'CreateA' >> beam.Create([b'[1, 2, 3]']),
                    'b': p | 'CreateB' >> beam.Create([b'[5]']),
                },
                'dataset_key_1': {
                    'c': p | 'CreateC' >> beam.Create([b'[9, 5, 2, 1]']),
                },
            }
            _ = cache_pcoll_dict | analyzer_cache.WriteAnalysisCacheToFS(
                base_test_dir)

        with beam.Pipeline() as p:
            read_cache = p | analyzer_cache.ReadAnalysisCacheFromFS(
                base_test_dir, list(cache_pcoll_dict.keys()))

            def assert_equal_matcher(expected_encoded):
                def _assert_equal(encoded_cache_list):
                    (encode_cache, ) = encoded_cache_list
                    self.assertEqual(expected_encoded, encode_cache)

                return _assert_equal

            beam_test_util.assert_that(read_cache['dataset_key_0']['a'],
                                       beam_test_util.equal_to([b'[1, 2, 3]']),
                                       label='AssertA')
            beam_test_util.assert_that(read_cache['dataset_key_0']['b'],
                                       assert_equal_matcher(b'[5]'),
                                       label='AssertB')
            beam_test_util.assert_that(read_cache['dataset_key_1']['c'],
                                       assert_equal_matcher(b'[9, 5, 2, 1]'),
                                       label='AssertC')

    def test_cache_helpers_with_alternative_io(self):
        class LocalSink(beam.PTransform):
            def __init__(self, path, file_name_suffix):
                del file_name_suffix
                self._path = path

            def expand(self, pcoll):
                def write_to_file(value):
                    tf.io.gfile.makedirs(self._path)
                    with open(os.path.join(self._path, 'cache'), 'w') as f:
                        f.write(value)

                return pcoll | beam.Map(write_to_file)

        test_cache_dict = {'a': {'b': [str([17, 19, 27, 31])]}}

        class LocalSource(beam.PTransform):
            def __init__(self, path):
                del path

            def expand(self, pbegin):
                return pbegin | beam.Create([test_cache_dict['a']['b']])

        cache_dir = self.get_temp_dir()
        with beam.Pipeline() as p:
            _ = test_cache_dict | analyzer_cache.WriteAnalysisCacheToFS(
                cache_dir, sink=LocalSink)

            read_cache = p | analyzer_cache.ReadAnalysisCacheFromFS(
                cache_dir, list(test_cache_dict.keys()), source=LocalSource)

            self.assertItemsEqual(read_cache.keys(), ['a'])
            self.assertItemsEqual(read_cache['a'].keys(), ['b'])

            beam_test_util.assert_that(
                read_cache['a']['b'],
                beam_test_util.equal_to([test_cache_dict['a']['b']]))