def benchmark_nested_directories(self):
  """Benchmarks MatchingFilesDataset over a wide and deep directory tree.

  Builds `width` top-level branches, each `depth` levels deep. Interior
  levels hold files that must NOT match ('a.py', 'b.pyc'); only the
  deepest level holds matching files ('c.txt', 'd.log').
  """
  tmp_dir = tempfile.mkdtemp()
  width = 500
  depth = 10
  for branch in range(width):
    for level in range(depth):
      new_base = os.path.join(tmp_dir, str(branch),
                              *[str(d) for d in range(level)])
      os.makedirs(new_base)
      # Matching files exist only at the deepest level of each branch.
      if level < depth - 1:
        child_files = ['a.py', 'b.pyc']
      else:
        child_files = ['c.txt', 'd.log']
      for child in child_files:
        with open(os.path.join(new_base, child), 'w'):
          pass
  glob_dirs = os.path.join(*['**' for _ in range(depth)])
  patterns = [
      os.path.join(tmp_dir, glob_dirs, suffix)
      for suffix in ['*.txt', '*.log']
  ]
  # num_elements follows from the patterns above: files matching
  # '*.txt' / '*.log' are created exactly once per branch, so the
  # dataset yields two elements for each of the `width` branches.
  num_elements = width * 2
  dataset = matching_files.MatchingFilesDataset(patterns)
  self.run_and_report_benchmark(
      dataset=dataset,
      iters=3,
      num_elements=num_elements,
      name='nested_directory(%d*%d)' % (width, depth))
  shutil.rmtree(tmp_dir, ignore_errors=True)
def testEmptyDirectory(self):
  """Test the MatchingFiles dataset with an empty directory."""
  pattern = os.path.join(self.tmp_dir, '*')
  dataset = matching_files.MatchingFilesDataset(pattern)
  # No file matches the glob, so iteration must fail with NotFoundError.
  self.assertDatasetProduces(
      dataset, expected_error=(errors.NotFoundError, ''))
def benchmarkNestedDirectories(self):
  """Benchmarks MatchingFilesDataset over nested directories (graph mode).

  Builds a tree of `width` branches, each `depth` levels deep; only the
  deepest level of each branch contains files that match the '*.txt' /
  '*.log' patterns. Times each `sess.run` of the iterator individually,
  takes the per-position median over `iters` runs, and reports the sum
  as the benchmark wall time.
  """
  tmp_dir = tempfile.mkdtemp()
  width = 500
  depth = 10
  for i in range(width):
    for j in range(depth):
      new_base = os.path.join(tmp_dir, str(i),
                              *[str(dir_name) for dir_name in range(j)])
      os.makedirs(new_base)
      # Non-matching files ('a.py', 'b.pyc') at interior levels; matching
      # files ('c.txt', 'd.log') only at the deepest level.
      child_files = ['a.py', 'b.pyc'
                    ] if j < depth - 1 else ['c.txt', 'd.log']
      for f in child_files:
        filename = os.path.join(new_base, f)
        open(filename, 'w').close()
  # One recursive glob per suffix, with `depth` '**' path components.
  patterns = [
      os.path.join(tmp_dir, os.path.join(*['**' for _ in range(depth)]),
                   suffix) for suffix in ['*.txt', '*.log']
  ]
  deltas = []
  iters = 3
  for _ in range(iters):
    # Fresh graph per iteration so each run re-does the directory walk.
    with ops.Graph().as_default():
      dataset = matching_files.MatchingFilesDataset(patterns)
      next_element = dataset_ops.make_one_shot_iterator(
          dataset).get_next()
      with session.Session() as sess:
        sub_deltas = []
        while True:
          try:
            # Time each element fetch; the first fetch pays for the
            # bulk of the filesystem traversal.
            start = time.time()
            sess.run(next_element)
            end = time.time()
            sub_deltas.append(end - start)
          except errors.OutOfRangeError:
            # Iterator exhausted: all matching filenames produced.
            break
        deltas.append(sub_deltas)
  # Median across iterations, per element position.
  median_deltas = np.median(deltas, axis=0)
  print('Nested directory size (width*depth): %d*%d Median wall time: '
        '%fs (read first filename), %fs (read second filename), avg %fs'
        ' (read %d more filenames)' %
        (width, depth, median_deltas[0], median_deltas[1],
         np.average(median_deltas[2:]), len(median_deltas) - 2))
  self.report_benchmark(
      iters=iters,
      wall_time=np.sum(median_deltas),
      extras={
          'read first file:':
              median_deltas[0],
          'read second file:':
              median_deltas[1],
          'avg time for reading %d more filenames:' %
          (len(median_deltas) - 2):
              np.average(median_deltas[2:])
      },
      name='dataset_nested_directory(%d*%d)' % (width, depth))
  shutil.rmtree(tmp_dir, ignore_errors=True)
def testNonExistingDirectory(self):
  """Test the MatchingFiles dataset with a non-existing directory."""
  # Point at a subdirectory that was never created.
  self.tmp_dir = os.path.join(self.tmp_dir, 'nonexistingdir')
  pattern = os.path.join(self.tmp_dir, '*')
  dataset = matching_files.MatchingFilesDataset(pattern)
  self.assertDatasetProduces(
      dataset, expected_error=(errors.NotFoundError, ''))
def testEmptyDirectory(self):
  """Test the MatchingFiles dataset with an empty directory."""
  pattern = os.path.join(self.tmp_dir, '*')
  dataset = matching_files.MatchingFilesDataset(pattern)
  with self.cached_session() as sess:
    get_next = dataset_ops.make_one_shot_iterator(dataset).get_next()
    # An empty directory yields no matches, so the very first fetch
    # must raise NotFoundError.
    with self.assertRaises(errors.NotFoundError):
      sess.run(get_next)
def testNonExistingDirectory(self):
  """Test the MatchingFiles dataset with a non-existing directory."""
  self.tmp_dir = os.path.join(self.tmp_dir, 'nonexistingdir')
  dataset = matching_files.MatchingFilesDataset(
      os.path.join(self.tmp_dir, '*'))
  with self.cached_session() as sess:
    # Use dataset_ops.make_one_shot_iterator(...) for consistency with the
    # sibling session-based tests in this file; the instance method
    # Dataset.make_one_shot_iterator() is the deprecated V1 spelling.
    next_element = dataset_ops.make_one_shot_iterator(dataset).get_next()
    # A non-existing directory must surface as NotFoundError on the
    # first fetch.
    with self.assertRaises(errors.NotFoundError):
      sess.run(next_element)
def testFileMiddles(self):
  """Test the MatchingFiles dataset using the middles of filename."""
  filenames = ['aa.txt', 'bb.py', 'bbc.pyc', 'cc.pyc']
  self._touchTempFiles(filenames)
  dataset = matching_files.MatchingFilesDataset(
      os.path.join(self.tmp_dir, 'b*.py*'))
  # 'b*.py*' matches 'bb.py' and 'bbc.pyc' only (indices 1 and 2).
  expected = [
      compat.as_bytes(os.path.join(self.tmp_dir, name))
      for name in filenames[1:3]
  ]
  self.assertDatasetProduces(
      dataset, expected_output=expected, assert_items_equal=True)
def testSimpleDirectory(self):
  """Test the MatchingFiles dataset with a simple directory."""
  filenames = ['a', 'b', 'c']
  self._touchTempFiles(filenames)
  dataset = matching_files.MatchingFilesDataset(
      os.path.join(self.tmp_dir, '*'))
  # '*' matches every file we just created; order is not guaranteed.
  expected = [
      compat.as_bytes(os.path.join(self.tmp_dir, name))
      for name in filenames
  ]
  self.assertDatasetProduces(
      dataset, expected_output=expected, assert_items_equal=True)
def testFileSuffixes(self):
  """Test the MatchingFiles dataset using the suffixes of filename."""
  filenames = ['a.txt', 'b.py', 'c.py', 'd.pyc']
  self._touchTempFiles(filenames)
  dataset = matching_files.MatchingFilesDataset(
      os.path.join(self.tmp_dir, '*.py'))
  with self.cached_session() as sess:
    next_element = dataset_ops.make_one_shot_iterator(dataset).get_next()
    # Only 'b.py' and 'c.py' (the middle slice) match '*.py'.
    expected_filenames = [
        compat.as_bytes(os.path.join(self.tmp_dir, name))
        for name in filenames[1:-1]
    ]
    # Pull exactly as many elements as we expect, in whatever order
    # the dataset produces them.
    actual_filenames = [
        compat.as_bytes(sess.run(next_element)) for _ in expected_filenames
    ]
    self.assertItemsEqual(expected_filenames, actual_filenames)
    # The iterator must now be exhausted.
    with self.assertRaises(errors.OutOfRangeError):
      sess.run(next_element)
def testSimpleDirectory(self):
  """Test the MatchingFiles dataset with a simple directory."""
  filenames = ['a', 'b', 'c']
  self._touchTempFiles(filenames)
  dataset = matching_files.MatchingFilesDataset(
      os.path.join(self.tmp_dir, '*'))
  with self.cached_session() as sess:
    next_element = dataset_ops.make_one_shot_iterator(dataset).get_next()
    # Every file in the directory matches '*'.
    expected_filenames = [
        compat.as_bytes(os.path.join(self.tmp_dir, name))
        for name in filenames
    ]
    # Pull one element per expected filename; order is not guaranteed.
    actual_filenames = [
        compat.as_bytes(sess.run(next_element)) for _ in expected_filenames
    ]
    self.assertItemsEqual(expected_filenames, actual_filenames)
    # The iterator must now be exhausted.
    with self.assertRaises(errors.OutOfRangeError):
      sess.run(next_element)
def testNestedDirectories(self):
  """Test the MatchingFiles dataset with nested directories."""
  width = 8
  depth = 4
  filenames = []
  for branch in range(width):
    for level in range(depth):
      new_base = os.path.join(self.tmp_dir, str(branch),
                              *[str(d) for d in range(level)])
      os.makedirs(new_base)
      # Interior levels get non-matching files; only the deepest level
      # gets files that match the '*.txt' / '*.log' patterns below.
      if level < depth - 1:
        child_files = ['a.py', 'b.pyc']
      else:
        child_files = ['c.txt', 'd.log']
      for child in child_files:
        filename = os.path.join(new_base, child)
        filenames.append(filename)
        with open(filename, 'w'):
          pass
  # One recursive glob per suffix, with `depth` '**' path components.
  glob_dirs = os.path.join(*['**' for _ in range(depth)])
  patterns = [
      os.path.join(self.tmp_dir, glob_dirs, suffix)
      for suffix in ['*.txt', '*.log']
  ]
  dataset = matching_files.MatchingFilesDataset(patterns)
  next_element = self.getNext(dataset)
  expected_filenames = [
      compat.as_bytes(f) for f in filenames
      if f.endswith(('.txt', '.log'))
  ]
  # Drain the dataset until OutOfRangeError marks exhaustion.
  actual_filenames = []
  while True:
    try:
      actual_filenames.append(
          compat.as_bytes(self.evaluate(next_element())))
    except errors.OutOfRangeError:
      break
  self.assertCountEqual(expected_filenames, actual_filenames)
def _build_iterator_graph(self, test_patterns):
  """Builds a MatchingFilesDataset over the given glob patterns."""
  dataset = matching_files.MatchingFilesDataset(test_patterns)
  return dataset