def test_cache_merge(self):
  base_test_dir = os.path.join(
      os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
      self._testMethodName)
  dataset_key_0 = analyzer_cache.DatasetKey('dataset_key_0')
  dataset_key_1 = analyzer_cache.DatasetKey('dataset_key_1')
  dataset_keys = (dataset_key_0, dataset_key_1)
  cache_keys = list('abcd')

  def read_manifests():
    return [
        analyzer_cache._ManifestFile(
            analyzer_cache._get_dataset_cache_path(base_test_dir, key)).read()
        for key in dataset_keys
    ]

  # First pipeline writes cache entries 'a'/'b' for dataset 0 and 'c'/'d' for
  # dataset 1.
  with beam.Pipeline() as p:
    cache_pcoll_dict = {
        dataset_key_0: {
            'a': p | 'CreateA' >> beam.Create([b'a']),
            'b': p | 'CreateB' >> beam.Create([b'b']),
        },
        dataset_key_1: {
            'c': p | 'CreateC' >> beam.Create([b'c']),
            'd': p | 'CreateD' >> beam.Create([b'd']),
        },
    }
    _ = cache_pcoll_dict | analyzer_cache.WriteAnalysisCacheToFS(
        p, base_test_dir, dataset_keys)

  first_manifests = read_manifests()

  # Second pipeline writes the remaining entries; the manifests should be
  # merged rather than overwritten.
  with beam.Pipeline() as p:
    cache_pcoll_dict = {
        dataset_key_0: {
            'c': p | 'CreateC' >> beam.Create([b'c']),
            'd': p | 'CreateD' >> beam.Create([b'd']),
        },
        dataset_key_1: {
            'a': p | 'CreateA' >> beam.Create([b'a']),
            'b': p | 'CreateB' >> beam.Create([b'b']),
        },
    }
    _ = cache_pcoll_dict | analyzer_cache.WriteAnalysisCacheToFS(
        p, base_test_dir, dataset_keys)

  second_manifests = read_manifests()

  self.assertEqual(len(first_manifests), len(second_manifests))
  for manifest_a, manifest_b in zip(first_manifests, second_manifests):
    # Every entry from the first write should survive the merge.
    for key_value_pair in manifest_a.items():
      self.assertIn(key_value_pair, manifest_b.items())

    self.assertEqual(2, len(manifest_a))
    self.assertCountEqual(range(len(manifest_a)), manifest_a.values())

    self.assertEqual(4, len(manifest_b))
    self.assertCountEqual(range(len(manifest_b)), manifest_b.values())
    self.assertCountEqual(cache_keys, manifest_b.keys())
def test_validate_dataset_keys(self):
  analyzer_cache.validate_dataset_keys({
      analyzer_cache.DatasetKey(k)
      for k in ('foo', 'Foo', 'A1', 'A_1', 'A.1', 'A-1', 'foo@1', 'foo*',
                'foo[]', 'foo/goo')
  })

  for key in {analyzer_cache.DatasetKey(k) for k in ('^foo^', 'foo 1')}:
    with self.assertRaisesRegex(
        ValueError, 'Dataset key .* does not match allowed pattern:'):
      analyzer_cache.validate_dataset_keys({key})
def test_cache_write_empty(self):
  base_test_dir = os.path.join(
      os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
      self._testMethodName)

  with beam.Pipeline() as p:
    _ = {} | analyzer_cache.WriteAnalysisCacheToFS(
        p, base_test_dir, (analyzer_cache.DatasetKey('dataset_key_0'),))

  self.assertFalse(os.path.isdir(base_test_dir))
def test_cache_helpers_round_trip(self):
  base_test_dir = os.path.join(
      os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
      self._testMethodName)
  dataset_key_0 = analyzer_cache.DatasetKey('dataset_key_0')
  dataset_key_1 = analyzer_cache.DatasetKey('dataset_key_1')
  dataset_keys = (dataset_key_0, dataset_key_1)

  with beam.Pipeline() as p:
    cache_pcoll_dict = {
        dataset_key_0: {
            b'\x8a': p | 'CreateA' >> beam.Create([b'[1, 2, 3]']),
            b'\x8b': p | 'CreateB' >> beam.Create([b'[5]']),
            b'\x8b1': p | 'CreateB1' >> beam.Create([b'[6]']),
        },
        dataset_key_1: {
            b'\x8c': p | 'CreateC' >> beam.Create([b'[9, 5, 2, 1]']),
        },
    }
    _ = cache_pcoll_dict | analyzer_cache.WriteAnalysisCacheToFS(
        p, base_test_dir, dataset_keys)

  with beam.Pipeline() as p:
    read_cache = p | analyzer_cache.ReadAnalysisCacheFromFS(
        base_test_dir, list(cache_pcoll_dict.keys()),
        [b'\x8a', b'\x8b', b'\x8c'])

    beam_test_util.assert_that(
        read_cache[dataset_key_0][b'\x8a'],
        beam_test_util.equal_to([b'[1, 2, 3]']),
        label='AssertA')
    beam_test_util.assert_that(
        read_cache[dataset_key_0][b'\x8b'],
        beam_test_util.equal_to([b'[5]']),
        label='AssertB')
    beam_test_util.assert_that(
        read_cache[dataset_key_1][b'\x8c'],
        beam_test_util.equal_to([b'[9, 5, 2, 1]']),
        label='AssertC')
def test_cache_helpers_with_alternative_io(self):

  class LocalSink(beam.PTransform):
    """Writes each cache value to a single local file."""

    def __init__(self, path):
      self._path = path

    def expand(self, pcoll):

      def write_to_file(value):
        tf.io.gfile.makedirs(self._path)
        with open(os.path.join(self._path, 'cache'), 'wb') as f:
          f.write(value)

      return pcoll | beam.Map(write_to_file)

  dataset_key = analyzer_cache.DatasetKey('a')
  test_cache_dict = {dataset_key: {'b': [bytes([17, 19, 27, 31])]}}

  class LocalSource(beam.PTransform):
    """Produces the expected cache contents instead of reading from disk."""

    def __init__(self, path):
      del path

    def expand(self, pbegin):
      return pbegin | beam.Create([test_cache_dict[dataset_key]['b']])

  dataset_keys = list(test_cache_dict.keys())
  cache_dir = self.get_temp_dir()
  with beam.Pipeline() as p:
    _ = test_cache_dict | analyzer_cache.WriteAnalysisCacheToFS(
        p, cache_dir, dataset_keys, sink=LocalSink)

    read_cache = p | analyzer_cache.ReadAnalysisCacheFromFS(
        cache_dir, dataset_keys, source=LocalSource)
    self.assertCountEqual(read_cache.keys(), [dataset_key])
    self.assertCountEqual(read_cache[dataset_key].keys(), ['b'])

    beam_test_util.assert_that(
        read_cache[dataset_key]['b'],
        beam_test_util.equal_to([test_cache_dict[dataset_key]['b']]))
def __init__(self,
             file_pattern: Text,
             materialize_output_path: Optional[Text] = None):
  """Initializes a Dataset.

  Args:
    file_pattern: The file pattern of the dataset.
    materialize_output_path: Optional path to which the materialized dataset
      is written.
  """
  file_pattern_suffix = os.path.join(
      *file_pattern.split(os.sep)[-self._FILE_PATTERN_SUFFIX_LENGTH:])
  self._file_pattern = file_pattern
  self._materialize_output_path = materialize_output_path
  self._index = None
  self._serialized = None
  self._decoded = None
  self._transformed = None
  self._transformed_and_serialized = None
  # Older versions of tensorflow_transform expose make_dataset_key instead of
  # the DatasetKey constructor, so support both.
  if hasattr(analyzer_cache, 'DatasetKey'):
    self._dataset_key = analyzer_cache.DatasetKey(file_pattern_suffix)
  else:
    self._dataset_key = analyzer_cache.make_dataset_key(file_pattern_suffix)
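# A minimal usage sketch (hypothetical call site; the file pattern and output
# path below are illustrative and not part of this class):
#
#   dataset = Dataset(
#       file_pattern=os.path.join(data_dir, 'train', 'part-*'),
#       materialize_output_path=os.path.join(output_dir, 'train_materialized'))
#
# The analyzer-cache dataset key is derived from the trailing components of
# the file pattern, so the same logical dataset resolves to the same cache
# entry across runs.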