Пример #1
0
  def test_seek_set(self):
    """Checks SEEK_SET on CompressedFile against a StringIO of the same data.

    For BZIP2 and GZIP, both streams are moved to absolute offsets around the
    start and the end of the content; the line read next and the position
    reported afterwards must match.
    """
    for compression_type in [CompressionTypes.BZIP2, CompressionTypes.GZIP]:
      file_name = self._create_compressed_file(compression_type, self.content)
      with open(file_name, 'rb') as raw:
        compressed_fd = CompressedFile(
            raw, compression_type, read_size=self.read_block_size)
        reference_fd = StringIO(self.content)

        content_length = len(self.content)
        # Offsets one before, at, and one after each end of the content.
        offsets = (-1, 0, 1,
                   content_length - 1, content_length, content_length + 1)
        for offset in offsets:
          compressed_fd.seek(offset, os.SEEK_SET)
          reference_fd.seek(offset, os.SEEK_SET)

          # The readline comparison has to precede the tell comparison:
          # cStringIO's tell() reports an out-of-bounds position after a seek
          # beyond the file until a real read occurs, while
          # _CompressedFile.tell() always stays within the bounds of the
          # uncompressed content.
          self.assertEqual(compressed_fd.readline(), reference_fd.readline())
          self.assertEqual(compressed_fd.tell(), reference_fd.tell())
Пример #2
0
  def test_seek_set(self):
    """Checks SEEK_SET on CompressedFile against a BytesIO of the same data.

    Runs for BZIP2, DEFLATE and GZIP. After each absolute seek, the line read
    next must match the reference, and the reported position must match the
    reference position capped at the content length.
    """
    for compression_type in [CompressionTypes.BZIP2, CompressionTypes.DEFLATE,
                             CompressionTypes.GZIP]:
      file_name = self._create_compressed_file(compression_type, self.content)
      with open(file_name, 'rb') as raw:
        compressed_fd = CompressedFile(
            raw, compression_type, read_size=self.read_block_size)
        reference_fd = BytesIO(self.content)

        content_length = len(self.content)
        # Negative offsets are left out: they are not supported by BytesIO
        # when whence is SEEK_SET.
        offsets = (0, 1,
                   content_length - 1, content_length, content_length + 1)
        for offset in offsets:
          compressed_fd.seek(offset, os.SEEK_SET)
          reference_fd.seek(offset, os.SEEK_SET)

          self.assertEqual(compressed_fd.readline(), reference_fd.readline())

          actual_position = compressed_fd.tell()
          # BytesIO's tell() reports out-of-bound positions after seeking
          # beyond the file, whereas _CompressedFile.tell() always stays
          # within the uncompressed content, so cap the reference value.
          expected_position = min(reference_fd.tell(), len(self.content))
          self.assertEqual(actual_position, expected_position)
Пример #3
0
  def test_seek_cur(self):
    """Checks SEEK_CUR on CompressedFile against a BytesIO of the same data.

    Runs for BZIP2, DEFLATE and GZIP, applying relative seeks in both
    directions, including ones that would leave the bounds of the content.
    """
    for compression_type in [CompressionTypes.BZIP2, CompressionTypes.DEFLATE,
                             CompressionTypes.GZIP]:
      file_name = self._create_compressed_file(compression_type, self.content)
      with open(file_name, 'rb') as raw:
        compressed_fd = CompressedFile(
            raw, compression_type, read_size=self.read_block_size)
        reference_fd = BytesIO(self.content)

        content_length = len(self.content)
        # Out-of-bound and in-bound relative moves, forwards and backwards.
        deltas = (-1, 0, 1,
                  content_length // 2,
                  content_length // 2,
                  (-content_length) // 2)
        for delta in deltas:
          compressed_fd.seek(delta, os.SEEK_CUR)
          reference_fd.seek(delta, os.SEEK_CUR)

          self.assertEqual(compressed_fd.readline(), reference_fd.readline())

          reference_position = reference_fd.tell()
          actual_position = compressed_fd.tell()
          # BytesIO reports out-of-bound positions after seeking beyond the
          # file, so cap the reference at the content length (consistent with
          # the old StringIO behavior) and move the reference stream there.
          capped_position = min(reference_position, len(self.content))
          reference_fd.seek(capped_position, os.SEEK_SET)
          self.assertEqual(actual_position, capped_position)
Пример #4
0
  def test_read_and_seek_back_to_beginning(self):
    """Rewinding to offset 0 must make readline() return the first line again.

    Exercised for BZIP2 and GZIP compressed copies of ``self.content``.
    """
    for compression_type in [CompressionTypes.BZIP2, CompressionTypes.GZIP]:
      path = self._create_compressed_file(compression_type, self.content)
      with open(path, 'rb') as raw:
        compressed_fd = CompressedFile(
            raw, compression_type, read_size=self.read_block_size)

        line_before_rewind = compressed_fd.readline()
        compressed_fd.seek(0, os.SEEK_SET)
        line_after_rewind = compressed_fd.readline()

        self.assertEqual(line_before_rewind, line_after_rewind)
Пример #5
0
  def test_read_from_end_returns_no_data(self):
    """Reading after seeking to the end of the stream must yield no data.

    Exercised for BZIP2 and GZIP compressed copies of ``self.content``.
    """
    for compression_type in [CompressionTypes.BZIP2, CompressionTypes.GZIP]:
      file_name = self._create_compressed_file(compression_type, self.content)
      with open(file_name, 'rb') as f:
        compressed_fd = CompressedFile(f, compression_type,
                                       read_size=self.read_block_size)

        seek_position = 0
        compressed_fd.seek(seek_position, os.SEEK_END)

        # The stream wraps a file opened in binary mode, so the empty result
        # is compared against a bytes literal. b'' == '' on Python 2, but on
        # Python 3 a text '' never equals b'' and the assertion would fail.
        expected_data = b''
        uncompressed_data = compressed_fd.read(10)

        self.assertEqual(uncompressed_data, expected_data)
Пример #6
0
    def test_seek_outside(self):
        """Seeks far outside the content must land on offset 0 or on the end.

        Runs for BZIP2, DEFLATE and GZIP, and for every ``whence`` mode.
        """
        codecs = [
            CompressionTypes.BZIP2, CompressionTypes.DEFLATE,
            CompressionTypes.GZIP
        ]
        for compression_type in codecs:
            file_name = self._create_compressed_file(compression_type,
                                                     self.content)
            with open(file_name, 'rb') as raw:
                compressed_fd = CompressedFile(
                    raw, compression_type, read_size=self.read_block_size)

                for whence in (os.SEEK_CUR, os.SEEK_SET, os.SEEK_END):
                    # Far before the start: position is expected to be 0.
                    compressed_fd.seek(-1 * len(self.content) - 10, whence)
                    self.assertEqual(compressed_fd.tell(), 0)

                    # Far past the end: position is expected to be the
                    # length of the uncompressed content.
                    compressed_fd.seek(len(self.content) + 20, whence)
                    self.assertEqual(compressed_fd.tell(), len(self.content))
Пример #7
0
  def test_read_from_end_returns_no_data(self):
    """Reading after seeking to the end of the stream must yield no data.

    Exercised for BZIP2 and GZIP compressed copies of ``self.content``.
    """
    for compression_type in [CompressionTypes.BZIP2, CompressionTypes.GZIP]:
      file_name = self._create_compressed_file(compression_type, self.content)
      with open(file_name, 'rb') as f:
        compressed_fd = CompressedFile(f, compression_type,
                                       read_size=self.read_block_size)

        seek_position = 0
        compressed_fd.seek(seek_position, os.SEEK_END)

        # The stream wraps a file opened in binary mode, so the empty result
        # is compared against a bytes literal. b'' == '' on Python 2, but on
        # Python 3 a text '' never equals b'' and the assertion would fail.
        expected_data = b''
        uncompressed_data = compressed_fd.read(10)

        self.assertEqual(uncompressed_data, expected_data)
Пример #8
0
  def _add_compression(stream, path, mime_type, compression_type):
    """Wraps ``stream`` in a CompressedFile when compression applies.

    Args:
      stream: the file-like object to (possibly) wrap.
      path: path used to detect the compression type when
        ``compression_type`` is ``CompressionTypes.AUTO``.
      mime_type: only 'application/octet-stream' is supported; any other
        value is reported with a warning and otherwise ignored.
      compression_type: a ``CompressionTypes`` value.

    Returns:
      ``stream`` unchanged when the resolved type is
      ``CompressionTypes.UNCOMPRESSED``, otherwise a ``CompressedFile``
      wrapping it.
    """
    if mime_type != 'application/octet-stream':
      logging.warning('Mime types are not supported. Got non-default mime_type:'
                      ' %s', mime_type)
    if compression_type == CompressionTypes.AUTO:
      compression_type = CompressionTypes.detect_compression_type(path)
    if compression_type != CompressionTypes.UNCOMPRESSED:
      # Hand the resolved type to the wrapper. Without it CompressedFile falls
      # back to its own default codec, regardless of what was requested or
      # detected from the path.
      return CompressedFile(stream, compression_type=compression_type)

    return stream
Пример #9
0
 def _path_open(self, path, mode, mime_type='application/octet-stream',
                compression_type=CompressionTypes.AUTO):
   """Opens ``path`` in ``mode`` through S3IO, adding decompression if needed.

   The compression type is resolved with ``FileSystem._get_compression_type``
   and the mime type with ``CompressionTypes.mime_type``. The raw S3 file is
   returned as is for ``CompressionTypes.UNCOMPRESSED``; any other type gets
   a ``CompressedFile`` wrapper configured with the resolved type.
   """
   resolved_type = FileSystem._get_compression_type(path, compression_type)
   resolved_mime = CompressionTypes.mime_type(resolved_type, mime_type)
   handle = s3io.S3IO().open(path, mode, mime_type=resolved_mime)
   if resolved_type != CompressionTypes.UNCOMPRESSED:
     return CompressedFile(handle, compression_type=resolved_type)
   return handle
Пример #10
0
 def _open_hdfs(self, path, mode, mime_type, compression_type):
     """Opens ``path`` with the HDFS client, adding decompression if needed.

     Args:
       path: path handed to ``self._hdfs_client.open`` and used to detect
         the compression type when it is ``CompressionTypes.AUTO``.
       mode: mode handed to ``self._hdfs_client.open``.
       mime_type: only 'application/octet-stream' is supported; any other
         value is reported with a warning and otherwise ignored.
       compression_type: a ``CompressionTypes`` value.

     Returns:
       The object returned by the HDFS client, wrapped in a
       ``CompressedFile`` unless the resolved type is
       ``CompressionTypes.UNCOMPRESSED``.
     """
     if mime_type != 'application/octet-stream':
         logging.warning(
             'Mime types are not supported. Got non-default mime_type:'
             ' %s', mime_type)
     if compression_type == CompressionTypes.AUTO:
         compression_type = CompressionTypes.detect_compression_type(path)
     res = self._hdfs_client.open(path, mode)
     if compression_type != CompressionTypes.UNCOMPRESSED:
         # Hand the resolved type to the wrapper. Without it CompressedFile
         # falls back to its own default codec, regardless of what was
         # requested or detected from the path.
         res = CompressedFile(res, compression_type=compression_type)
     return res
Пример #11
0
    def test_seek_cur(self):
        """Checks SEEK_CUR on CompressedFile against a StringIO of the data.

        Runs for BZIP2 and GZIP, applying relative seeks in both directions,
        including ones that would leave the bounds of the content. After each
        seek the line read next and the reported position must match.
        """
        for compression_type in [
                CompressionTypes.BZIP2, CompressionTypes.GZIP
        ]:
            file_name = self._create_compressed_file(compression_type,
                                                     self.content)
            with open(file_name, 'rb') as f:
                compressed_fd = CompressedFile(f,
                                               compression_type,
                                               read_size=self.read_block_size)
                reference_fd = StringIO(self.content)

                # Test out of bound, inbound seeking in both directions.
                # Floor division keeps the offsets integral: with '/' they
                # become floats on Python 3, which are not valid seek
                # offsets. On Python 2 both operators give the same ints.
                for seek_position in (-1, 0, 1, len(self.content) // 2,
                                      len(self.content) // 2,
                                      -1 * len(self.content) // 2):
                    compressed_fd.seek(seek_position, os.SEEK_CUR)
                    reference_fd.seek(seek_position, os.SEEK_CUR)

                    uncompressed_line = compressed_fd.readline()
                    expected_line = reference_fd.readline()
                    self.assertEqual(uncompressed_line, expected_line)

                    reference_position = reference_fd.tell()
                    uncompressed_position = compressed_fd.tell()
                    self.assertEqual(uncompressed_position, reference_position)
Пример #12
0
    def test_seek_set(self):
        """Checks SEEK_SET on CompressedFile against a BytesIO of the data.

        Runs for BZIP2, DEFLATE and GZIP. After each absolute seek, the line
        read next must match the reference, and the reported position must
        match the reference position capped at the content length.
        """
        codecs = [
            CompressionTypes.BZIP2, CompressionTypes.DEFLATE,
            CompressionTypes.GZIP
        ]
        for compression_type in codecs:
            file_name = self._create_compressed_file(compression_type,
                                                     self.content)
            with open(file_name, 'rb') as raw:
                compressed_fd = CompressedFile(
                    raw, compression_type, read_size=self.read_block_size)
                reference_fd = BytesIO(self.content)

                content_length = len(self.content)
                # Negative offsets are left out: they are not supported by
                # BytesIO when whence is SEEK_SET.
                offsets = (0, 1, content_length - 1, content_length,
                           content_length + 1)
                for offset in offsets:
                    compressed_fd.seek(offset, os.SEEK_SET)
                    reference_fd.seek(offset, os.SEEK_SET)

                    self.assertEqual(compressed_fd.readline(),
                                     reference_fd.readline())

                    actual_position = compressed_fd.tell()
                    # BytesIO's tell() reports out-of-bound positions after
                    # seeking beyond the file, whereas _CompressedFile.tell()
                    # always stays within the uncompressed content, so cap
                    # the reference value.
                    expected_position = min(reference_fd.tell(),
                                            len(self.content))
                    self.assertEqual(actual_position, expected_position)
Пример #13
0
    def test_seek_cur(self):
        """Checks SEEK_CUR on CompressedFile against a BytesIO of the data.

        Runs for BZIP2, DEFLATE and GZIP, applying relative seeks in both
        directions, including ones that would leave the bounds of the content.
        """
        codecs = [
            CompressionTypes.BZIP2, CompressionTypes.DEFLATE,
            CompressionTypes.GZIP
        ]
        for compression_type in codecs:
            file_name = self._create_compressed_file(compression_type,
                                                     self.content)
            with open(file_name, 'rb') as raw:
                compressed_fd = CompressedFile(
                    raw, compression_type, read_size=self.read_block_size)
                reference_fd = BytesIO(self.content)

                content_length = len(self.content)
                # Out-of-bound and in-bound relative moves, forwards and
                # backwards.
                deltas = (-1, 0, 1, content_length // 2,
                          content_length // 2,
                          (-content_length) // 2)
                for delta in deltas:
                    compressed_fd.seek(delta, os.SEEK_CUR)
                    reference_fd.seek(delta, os.SEEK_CUR)

                    self.assertEqual(compressed_fd.readline(),
                                     reference_fd.readline())

                    reference_position = reference_fd.tell()
                    actual_position = compressed_fd.tell()
                    # BytesIO reports out-of-bound positions after seeking
                    # beyond the file, so cap the reference at the content
                    # length (consistent with the old StringIO behavior) and
                    # move the reference stream there.
                    capped_position = min(reference_position,
                                          len(self.content))
                    reference_fd.seek(capped_position, os.SEEK_SET)
                    self.assertEqual(actual_position, capped_position)
Пример #14
0
    def test_seek_set(self):
        """Checks SEEK_SET on CompressedFile against a StringIO of the data.

        For BZIP2 and GZIP, both streams are moved to absolute offsets around
        the start and the end of the content; the line read next and the
        position reported afterwards must match.
        """
        codecs = [CompressionTypes.BZIP2, CompressionTypes.GZIP]
        for compression_type in codecs:
            file_name = self._create_compressed_file(compression_type,
                                                     self.content)
            with open(file_name, 'rb') as raw:
                compressed_fd = CompressedFile(
                    raw, compression_type, read_size=self.read_block_size)
                reference_fd = StringIO(self.content)

                content_length = len(self.content)
                # Offsets one before, at, and one after each end of the
                # content.
                offsets = (-1, 0, 1, content_length - 1, content_length,
                           content_length + 1)
                for offset in offsets:
                    compressed_fd.seek(offset, os.SEEK_SET)
                    reference_fd.seek(offset, os.SEEK_SET)

                    # The readline comparison has to precede the tell
                    # comparison: cStringIO's tell() reports an out-of-bounds
                    # position after a seek beyond the file until a real read
                    # occurs, while _CompressedFile.tell() always stays
                    # within the bounds of the uncompressed content.
                    self.assertEqual(compressed_fd.readline(),
                                     reference_fd.readline())
                    self.assertEqual(compressed_fd.tell(),
                                     reference_fd.tell())
Пример #15
0
  def test_seek_cur(self):
    """Checks SEEK_CUR on CompressedFile against a StringIO of the same data.

    Runs for BZIP2 and GZIP, applying relative seeks in both directions,
    including ones that would leave the bounds of the content. After each
    seek the line read next and the reported position must match.
    """
    for compression_type in [CompressionTypes.BZIP2, CompressionTypes.GZIP]:
      file_name = self._create_compressed_file(compression_type, self.content)
      with open(file_name, 'rb') as f:
        compressed_fd = CompressedFile(f, compression_type,
                                       read_size=self.read_block_size)
        reference_fd = StringIO(self.content)

        # Test out of bound, inbound seeking in both directions.
        # Floor division keeps the offsets integral: with '/' they become
        # floats on Python 3, which are not valid seek offsets. On Python 2
        # both operators give the same ints.
        for seek_position in (-1, 0, 1,
                              len(self.content) // 2,
                              len(self.content) // 2,
                              -1 * len(self.content) // 2):
          compressed_fd.seek(seek_position, os.SEEK_CUR)
          reference_fd.seek(seek_position, os.SEEK_CUR)

          uncompressed_line = compressed_fd.readline()
          expected_line = reference_fd.readline()
          self.assertEqual(uncompressed_line, expected_line)

          reference_position = reference_fd.tell()
          uncompressed_position = compressed_fd.tell()
          self.assertEqual(uncompressed_position, reference_position)
Пример #16
0
    def test_tell(self):
        """tell() must equal the running byte count while writing and reading.

        Ten short lines are written through a CompressedFile, checking tell()
        after each write; the file is then read back line by line with the
        same check after every readline(), including the final empty one.
        """
        payload = [b'line%d\n' % i for i in range(10)]
        tmpfile = self._create_temp_file()

        with open(tmpfile, 'wb') as sink:
            writer = CompressedFile(sink)
            bytes_written = 0
            for chunk in payload:
                writer.write(chunk)
                bytes_written += len(chunk)
                self.assertEqual(bytes_written, writer.tell())

        with open(tmpfile, 'rb') as source:
            reader = CompressedFile(source)
            bytes_read = 0
            exhausted = False
            while not exhausted:
                chunk = reader.readline()
                bytes_read += len(chunk)
                self.assertEqual(bytes_read, reader.tell())
                # An empty line ends the loop, after its position check.
                exhausted = not chunk
Пример #17
0
  def test_read_and_seek_back_to_beginning(self):
    """Rewinding to offset 0 must make readline() return the first line again.

    Exercised for BZIP2 and GZIP compressed copies of ``self.content``.
    """
    for compression_type in [CompressionTypes.BZIP2, CompressionTypes.GZIP]:
      path = self._create_compressed_file(compression_type, self.content)
      with open(path, 'rb') as raw:
        compressed_fd = CompressedFile(
            raw, compression_type, read_size=self.read_block_size)

        line_before_rewind = compressed_fd.readline()
        compressed_fd.seek(0, os.SEEK_SET)
        line_after_rewind = compressed_fd.readline()

        self.assertEqual(line_before_rewind, line_after_rewind)
Пример #18
0
  def test_tell(self):
    """tell() must equal the running byte count while writing and reading.

    Ten short lines are written through a CompressedFile, checking tell()
    after each write; the file is then read back line by line with the same
    check after every readline(), including the final empty one.
    """
    payload = [b'line%d\n' % i for i in range(10)]
    tmpfile = self._create_temp_file()

    with open(tmpfile, 'wb') as sink:
      writer = CompressedFile(sink)
      bytes_written = 0
      for chunk in payload:
        writer.write(chunk)
        bytes_written += len(chunk)
        self.assertEqual(bytes_written, writer.tell())

    with open(tmpfile, 'rb') as source:
      reader = CompressedFile(source)
      bytes_read = 0
      exhausted = False
      while not exhausted:
        chunk = reader.readline()
        bytes_read += len(chunk)
        self.assertEqual(bytes_read, reader.tell())
        # An empty line ends the loop, after its position check.
        exhausted = not chunk
Пример #19
0
  def test_seek_outside(self):
    """Seeks far outside the content must land on offset 0 or on the end.

    Runs for BZIP2 and GZIP, and for every ``whence`` mode.
    """
    for compression_type in [CompressionTypes.BZIP2, CompressionTypes.GZIP]:
      file_name = self._create_compressed_file(compression_type, self.content)
      with open(file_name, 'rb') as raw:
        compressed_fd = CompressedFile(
            raw, compression_type, read_size=self.read_block_size)

        for whence in (os.SEEK_CUR, os.SEEK_SET, os.SEEK_END):
          # Far before the start: position is expected to be 0.
          compressed_fd.seek(-1 * len(self.content) - 10, whence)
          self.assertEqual(compressed_fd.tell(), 0)

          # Far past the end: position is expected to be the length of the
          # uncompressed content.
          compressed_fd.seek(len(self.content) + 20, whence)
          self.assertEqual(compressed_fd.tell(), len(self.content))
Пример #20
0
 def test_seekable_enabled_on_read(self):
     """A CompressedFile around a file opened for reading reports seekable."""
     # Open through a context manager so the temp file handle is closed even
     # when the assertion fails, instead of being left for the garbage
     # collector.
     with open(self._create_temp_file(), 'r') as f:
         readable = CompressedFile(f)
         self.assertTrue(readable.seekable)
Пример #21
0
    def test_seekable(self):
        """CompressedFile reports not seekable for read and for write files."""
        # Both files are opened through context managers so their handles are
        # closed even when an assertion fails, instead of being leaked.
        with open(self._create_temp_file(), 'r') as f:
            readable = CompressedFile(f)
            self.assertFalse(readable.seekable)

        with open(self._create_temp_file(), 'w') as f:
            writeable = CompressedFile(f)
            self.assertFalse(writeable.seekable)
Пример #22
0
    def test_concatenated_compressed_file(self):
        """Reads back a file made of several concatenated compressed members.

        Each test line is compressed into its own file and the compressed
        files are concatenated byte for byte. The result is read line by line
        through CompressedFile with a read_size smaller than a line, for both
        BZIP2 and GZIP.
        """
        # Per the original author's note,
        # apache_beam.io.textio_test.test_read_gzip_concat does not hit the
        # problem in Beam 2.13 and earlier because its data is smaller than
        # read_size and therefore takes a code path that avoids it. This test
        # uses a small read_size and large lines to reach the problem. Doing
        # so from textio_test would need very large data, since the default
        # read_size is 16MiB and ReadFromText does not expose read_size.
        import random
        import threading
        from six import int2byte
        line_count = 10
        timeout_seconds = 30
        small_read_size = (64 << 10)  # set much smaller than the line size
        alphabet = tuple(int2byte(code) for code in range(32, 96))

        def generate_random_line():
            # 4096 shuffled samples of 64 single-byte strings, plus a newline.
            pieces = [
                piece for _ in range(4096)
                for piece in random.sample(alphabet, 64)
            ]
            pieces.append(b'\n')
            return b''.join(pieces)

        def create_test_file(compression_type, lines):
            # Compress every line into its own temp file, then concatenate
            # the compressed files into one and return its name.
            part_names = []
            combined_name = self._create_temp_file()
            if compression_type == CompressionTypes.BZIP2:
                opener = bz2.BZ2File
            elif compression_type == CompressionTypes.GZIP:
                opener = gzip.open
            else:
                assert False, "Invalid compression type: %s" % compression_type
            for line in lines:
                part_name = self._create_temp_file()
                part_names.append(part_name)
                with opener(part_name, 'wb') as part:
                    part.write(line)
            with open(combined_name, 'wb') as combined:
                for part_name in part_names:
                    with open(part_name, 'rb') as part:
                        combined.write(part.read())
            return combined_name

        # Per the original author's note, a job on a real concatenated gzip
        # file once ended up in an endless loop in the Beam filesystem module,
        # hence this guard. This test does not hit an endless loop with the
        # Beam 2.13 and earlier implementation (it hits a different error),
        # so the guard is not strictly required here.
        # NOTE(review): threading.Timer runs its callback on the timer's own
        # thread, so this IOError is presumably not raised in the test thread.

        def timeout_handler():
            raise IOError('Exiting due to likley infinite loop logic in code.')

        watchdog = threading.Timer(timeout_seconds, timeout_handler)
        try:
            expected_lines = tuple(
                generate_random_line() for _ in range(line_count))
            for compression_type in [
                    CompressionTypes.BZIP2, CompressionTypes.GZIP
            ]:
                file_name = create_test_file(compression_type, expected_lines)
                watchdog.start()
                with open(file_name, 'rb') as raw:
                    reader = CompressedFile(
                        raw, compression_type, read_size=small_read_size)
                    for expected_line in expected_lines:
                        actual_line = reader.readline()
                        self.assertEqual(expected_line, actual_line)
                watchdog.cancel()
                # A Timer can only be started once; make a fresh one for the
                # next compression type.
                watchdog = threading.Timer(timeout_seconds, timeout_handler)
        finally:
            watchdog.cancel()
Пример #23
0
 def test_seekable_disabled_on_write(self):
     """A CompressedFile around a file opened for writing is not seekable."""
     # Open through a context manager so the temp file handle is closed even
     # when the assertion fails, instead of being left for the garbage
     # collector.
     with open(self._create_temp_file(), 'w') as f:
         writeable = CompressedFile(f)
         self.assertFalse(writeable.seekable)
Пример #24
0
 def test_seekable_enabled_on_read(self):
     """A CompressedFile around a file opened with 'rb' reports seekable."""
     temp_path = self._create_temp_file()
     with open(temp_path, 'rb') as raw:
         self.assertTrue(CompressedFile(raw).seekable)
Пример #25
0
 def test_seekable_disabled_on_append(self):
     """A CompressedFile around a file opened with 'ab' is not seekable."""
     temp_path = self._create_temp_file()
     with open(temp_path, 'ab') as raw:
         self.assertFalse(CompressedFile(raw).seekable)