def benchmark_nested_directories(self):
        """Benchmarks matching files across a wide and deep directory tree."""
        base_dir = tempfile.mkdtemp()
        width = 500
        depth = 10
        for branch in range(width):
            for level in range(depth):
                subdir = os.path.join(
                    base_dir, str(branch),
                    *(str(d) for d in range(level)))
                os.makedirs(subdir)
                # Non-matching files at every level; matching files only at
                # the deepest level of each branch.
                if level < depth - 1:
                    names = ['a.py', 'b.pyc']
                else:
                    names = ['c.txt', 'd.log']
                for name in names:
                    with open(os.path.join(subdir, name), 'w'):
                        pass

        glob_dirs = os.path.join(*['**'] * depth)
        patterns = [
            os.path.join(base_dir, glob_dirs, suffix)
            for suffix in ['*.txt', '*.log']
        ]
        # the num_elements depends on the pattern that has been defined above.
        # In the current scenario, the num of files are selected based on the
        # ['*.txt', '*.log'] patterns. Since the files which match either of these
        # patterns are created once per `width`. The num_elements would be:
        num_elements = width * 2

        dataset = matching_files.MatchingFilesDataset(patterns)
        self.run_and_report_benchmark(
            dataset=dataset,
            iters=3,
            num_elements=num_elements,
            name='nested_directory(%d*%d)' % (width, depth))

        shutil.rmtree(base_dir, ignore_errors=True)
    def testEmptyDirectory(self):
        """Test the MatchingFiles dataset with an empty directory."""

        # An empty directory yields no matches, which surfaces as NotFoundError.
        pattern = os.path.join(self.tmp_dir, '*')
        dataset = matching_files.MatchingFilesDataset(pattern)
        self.assertDatasetProduces(
            dataset, expected_error=(errors.NotFoundError, ''))
# Example no. 3
    def benchmarkNestedDirectories(self):
        """Benchmarks per-element fetch latency over a nested directory tree."""
        tmp_dir = tempfile.mkdtemp()
        width = 500
        depth = 10
        for branch in range(width):
            for level in range(depth):
                subdir = os.path.join(
                    tmp_dir, str(branch),
                    *(str(d) for d in range(level)))
                os.makedirs(subdir)
                # Matching files ('.txt'/'.log') exist only at the deepest level.
                if level < depth - 1:
                    names = ['a.py', 'b.pyc']
                else:
                    names = ['c.txt', 'd.log']
                for name in names:
                    with open(os.path.join(subdir, name), 'w'):
                        pass

        glob_dirs = os.path.join(*['**'] * depth)
        patterns = [
            os.path.join(tmp_dir, glob_dirs, suffix)
            for suffix in ['*.txt', '*.log']
        ]

        iters = 3
        deltas = []
        for _ in range(iters):
            with ops.Graph().as_default():
                dataset = matching_files.MatchingFilesDataset(patterns)
                next_element = dataset_ops.make_one_shot_iterator(
                    dataset).get_next()

                with session.Session() as sess:
                    # Time each individual element fetch; the first fetches are
                    # expected to dominate since they trigger the directory walk.
                    sub_deltas = []
                    while True:
                        try:
                            start = time.time()
                            sess.run(next_element)
                            sub_deltas.append(time.time() - start)
                        except errors.OutOfRangeError:
                            break
                    deltas.append(sub_deltas)

        median_deltas = np.median(deltas, axis=0)
        print('Nested directory size (width*depth): %d*%d Median wall time: '
              '%fs (read first filename), %fs (read second filename), avg %fs'
              ' (read %d more filenames)' %
              (width, depth, median_deltas[0], median_deltas[1],
               np.average(median_deltas[2:]), len(median_deltas) - 2))
        extras = {
            'read first file:': median_deltas[0],
            'read second file:': median_deltas[1],
            'avg time for reading %d more filenames:' %
            (len(median_deltas) - 2): np.average(median_deltas[2:]),
        }
        self.report_benchmark(
            iters=iters,
            wall_time=np.sum(median_deltas),
            extras=extras,
            name='dataset_nested_directory(%d*%d)' % (width, depth))

        shutil.rmtree(tmp_dir, ignore_errors=True)
    def testNonExistingDirectory(self):
        """Test the MatchingFiles dataset with a non-existing directory."""

        # Matching inside a directory that does not exist raises NotFoundError.
        self.tmp_dir = os.path.join(self.tmp_dir, 'nonexistingdir')
        pattern = os.path.join(self.tmp_dir, '*')
        self.assertDatasetProduces(
            matching_files.MatchingFilesDataset(pattern),
            expected_error=(errors.NotFoundError, ''))
# Example no. 5
  def testEmptyDirectory(self):
    """Test the MatchingFiles dataset with an empty directory."""

    pattern = os.path.join(self.tmp_dir, '*')
    dataset = matching_files.MatchingFilesDataset(pattern)
    with self.cached_session() as sess:
      get_next = dataset_ops.make_one_shot_iterator(dataset).get_next()
      # No files exist, so the very first fetch must fail with NotFoundError.
      with self.assertRaises(errors.NotFoundError):
        sess.run(get_next)
# Example no. 6
  def testNonExistingDirectory(self):
    """Test the MatchingFiles dataset with a non-existing directory."""

    self.tmp_dir = os.path.join(self.tmp_dir, 'nonexistingdir')
    dataset = matching_files.MatchingFilesDataset(
        os.path.join(self.tmp_dir, '*'))
    with self.cached_session() as sess:
      # Use the module-level helper, consistent with the other tests in this
      # file; the instance method `dataset.make_one_shot_iterator()` is the
      # deprecated TF1-only API and is not available on V2 dataset objects.
      next_element = dataset_ops.make_one_shot_iterator(dataset).get_next()
      # Matching inside a directory that does not exist raises NotFoundError.
      with self.assertRaises(errors.NotFoundError):
        sess.run(next_element)
    def testFileMiddles(self):
        """Test the MatchingFiles dataset using the middles of filename."""

        filenames = ['aa.txt', 'bb.py', 'bbc.pyc', 'cc.pyc']
        self._touchTempFiles(filenames)

        # Only 'bb.py' and 'bbc.pyc' match the pattern 'b*.py*'.
        expected = [
            compat.as_bytes(os.path.join(self.tmp_dir, name))
            for name in filenames[1:3]
        ]
        dataset = matching_files.MatchingFilesDataset(
            os.path.join(self.tmp_dir, 'b*.py*'))
        self.assertDatasetProduces(
            dataset, expected_output=expected, assert_items_equal=True)
    def testSimpleDirectory(self):
        """Test the MatchingFiles dataset with a simple directory."""

        filenames = ['a', 'b', 'c']
        self._touchTempFiles(filenames)

        # The '*' pattern matches every file created above.
        expected = [
            compat.as_bytes(os.path.join(self.tmp_dir, name))
            for name in filenames
        ]
        dataset = matching_files.MatchingFilesDataset(
            os.path.join(self.tmp_dir, '*'))
        self.assertDatasetProduces(
            dataset, expected_output=expected, assert_items_equal=True)
# Example no. 9
  def testFileSuffixes(self):
    """Test the MatchingFiles dataset using the suffixes of filename."""

    filenames = ['a.txt', 'b.py', 'c.py', 'd.pyc']
    self._touchTempFiles(filenames)

    dataset = matching_files.MatchingFilesDataset(
        os.path.join(self.tmp_dir, '*.py'))
    with self.cached_session() as sess:
      get_next = dataset_ops.make_one_shot_iterator(dataset).get_next()
      expected = []
      actual = []
      # Only 'b.py' and 'c.py' match '*.py', so exactly two fetches succeed.
      for name in filenames[1:-1]:
        expected.append(compat.as_bytes(os.path.join(self.tmp_dir, name)))
        actual.append(compat.as_bytes(sess.run(get_next)))

      self.assertItemsEqual(expected, actual)
      # The dataset must then be exhausted.
      with self.assertRaises(errors.OutOfRangeError):
        sess.run(get_next)
# Example no. 10
  def testSimpleDirectory(self):
    """Test the MatchingFiles dataset with a simple directory."""

    filenames = ['a', 'b', 'c']
    self._touchTempFiles(filenames)

    dataset = matching_files.MatchingFilesDataset(
        os.path.join(self.tmp_dir, '*'))
    with self.cached_session() as sess:
      get_next = dataset_ops.make_one_shot_iterator(dataset).get_next()

      expected = []
      actual = []
      # '*' matches every file created above, one fetch per file.
      for name in filenames:
        expected.append(compat.as_bytes(os.path.join(self.tmp_dir, name)))
        actual.append(compat.as_bytes(sess.run(get_next)))

      self.assertItemsEqual(expected, actual)
      # The dataset must then be exhausted.
      with self.assertRaises(errors.OutOfRangeError):
        sess.run(get_next)
    def testNestedDirectories(self):
        """Test the MatchingFiles dataset with nested directories."""

        created = []
        width = 8
        depth = 4
        for branch in range(width):
            for level in range(depth):
                subdir = os.path.join(
                    self.tmp_dir, str(branch),
                    *(str(d) for d in range(level)))
                os.makedirs(subdir)
                # Matching ('.txt'/'.log') files exist only at the deepest level.
                if level < depth - 1:
                    names = ['a.py', 'b.pyc']
                else:
                    names = ['c.txt', 'd.log']
                for name in names:
                    path = os.path.join(subdir, name)
                    created.append(path)
                    with open(path, 'w'):
                        pass

        glob_dirs = os.path.join(*['**'] * depth)
        patterns = [
            os.path.join(self.tmp_dir, glob_dirs, suffix)
            for suffix in ['*.txt', '*.log']
        ]

        dataset = matching_files.MatchingFilesDataset(patterns)
        next_element = self.getNext(dataset)
        expected = [
            compat.as_bytes(path) for path in created
            if path.endswith(('.txt', '.log'))
        ]
        actual = []
        # Drain the dataset until it signals exhaustion.
        while True:
            try:
                actual.append(
                    compat.as_bytes(self.evaluate(next_element())))
            except errors.OutOfRangeError:
                break

        self.assertCountEqual(expected, actual)
# Example no. 12
 def _build_iterator_graph(self, test_patterns):
     """Returns a MatchingFiles dataset over the given glob pattern(s)."""
     dataset = matching_files.MatchingFilesDataset(test_patterns)
     return dataset