Example #1
 def test_gcs_path_object_optional(self):
   self.assertEqual(
       gcsio.parse_gcs_path('gs://bucket/name', object_optional=True),
       ('bucket', 'name'))
   self.assertEqual(
       gcsio.parse_gcs_path('gs://bucket/', object_optional=True),
       ('bucket', ''))
Example #2
 def test_gcs_path_object_optional(self):
     self.assertEqual(
         gcsio.parse_gcs_path('gs://bucket/name', object_optional=True),
         ('bucket', 'name'))
     self.assertEqual(
         gcsio.parse_gcs_path('gs://bucket/', object_optional=True),
         ('bucket', ''))
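The two examples above exercise `gcsio.parse_gcs_path` with `object_optional=True`, which permits a path such as 'gs://bucket/' that names only a bucket. A minimal sketch of such a parser, inferred from the asserted outputs and not taken from Apache Beam's actual source, might look like this:

import re

def parse_gcs_path(gcs_path, object_optional=False):
  # Split 'gs://<bucket>/<object>' into ('<bucket>', '<object>').
  match = re.match(r'^gs://([^/]+)/(.*)$', gcs_path)
  if match is None:
    raise ValueError('GCS path must have the form gs://<bucket>/<object>.')
  bucket, object_name = match.group(1), match.group(2)
  if not object_name and not object_optional:
    raise ValueError('Missing object name in GCS path: %s' % gcs_path)
  return bucket, object_name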
Example #3
    def test_copytree(self):
        src_dir_name = 'gs://gcsio-test/source/'
        dest_dir_name = 'gs://gcsio-test/dest/'
        file_size = 1024
        paths = ['a', 'b/c', 'b/d']
        for path in paths:
            src_file_name = src_dir_name + path
            dest_file_name = dest_dir_name + path
            self._insert_random_file(self.client, src_file_name, file_size)
            self.assertTrue(
                gcsio.parse_gcs_path(src_file_name) in
                self.client.objects.files)
            self.assertFalse(
                gcsio.parse_gcs_path(dest_file_name) in
                self.client.objects.files)

        self.gcs.copytree(src_dir_name, dest_dir_name)

        for path in paths:
            src_file_name = src_dir_name + path
            dest_file_name = dest_dir_name + path
            self.assertTrue(
                gcsio.parse_gcs_path(src_file_name) in
                self.client.objects.files)
            self.assertTrue(
                gcsio.parse_gcs_path(dest_file_name) in
                self.client.objects.files)
Example #4
  def test_delete(self):
    file_name = 'gs://gcsio-test/delete_me'
    file_size = 1024

    # Test deletion of non-existent file.
    self.gcs.delete(file_name)

    self._insert_random_file(self.client, file_name, file_size)
    self.assertTrue(
        gcsio.parse_gcs_path(file_name) in self.client.objects.files)

    self.gcs.delete(file_name)

    self.assertFalse(
        gcsio.parse_gcs_path(file_name) in self.client.objects.files)
Example #5
  def test_rename(self):
    src_file_name = 'gs://gcsio-test/source'
    dest_file_name = 'gs://gcsio-test/dest'
    file_size = 1024
    self._insert_random_file(self.client, src_file_name, file_size)
    self.assertTrue(
        gcsio.parse_gcs_path(src_file_name) in self.client.objects.files)
    self.assertFalse(
        gcsio.parse_gcs_path(dest_file_name) in self.client.objects.files)

    self.gcs.rename(src_file_name, dest_file_name)

    self.assertFalse(
        gcsio.parse_gcs_path(src_file_name) in self.client.objects.files)
    self.assertTrue(
        gcsio.parse_gcs_path(dest_file_name) in self.client.objects.files)
Example #6
 def _insert_random_file(self, client, path, size, generation=1, crc32c=None,
                         last_updated=None):
   bucket, name = gcsio.parse_gcs_path(path)
   f = FakeFile(bucket, name, os.urandom(size), generation, crc32c=crc32c,
                last_updated=last_updated)
   client.objects.add_file(f)
   return f
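This helper and the assertions in the tests above assume a fake GCS client whose `objects.files` mapping is keyed by (bucket, object name) tuples, which is why `gcsio.parse_gcs_path(path) in client.objects.files` works as an existence check. A hypothetical sketch of such a store (the real fake used by these tests may differ in attribute and method names):

class FakeGcsObjects(object):
  """Hypothetical in-memory store keyed by (bucket, object name) tuples."""

  def __init__(self):
    self.files = {}  # Maps (bucket, object name) -> FakeFile.

  def add_file(self, f):
    # Assumes FakeFile exposes its bucket and object name as attributes.
    self.files[(f.bucket, f.object)] = f

  def get_file(self, bucket, name):
    return self.files.get((bucket, name))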
Example #7
  def test_copy(self):
    src_file_name = 'gs://gcsio-test/source'
    dest_file_name = 'gs://gcsio-test/dest'
    file_size = 1024
    self._insert_random_file(self.client, src_file_name, file_size)
    self.assertTrue(
        gcsio.parse_gcs_path(src_file_name) in self.client.objects.files)
    self.assertFalse(
        gcsio.parse_gcs_path(dest_file_name) in self.client.objects.files)

    self.gcs.copy(src_file_name, dest_file_name)

    self.assertTrue(
        gcsio.parse_gcs_path(src_file_name) in self.client.objects.files)
    self.assertTrue(
        gcsio.parse_gcs_path(dest_file_name) in self.client.objects.files)

    self.assertRaises(IOError, self.gcs.copy, 'gs://gcsio-test/non-existent',
                      'gs://gcsio-test/non-existent-destination')
Example #9
def compose_vcf_shards(
        project,  # type: str
        vcf_data_header_file_path,  # type: str
        vcf_data_files_folder,  # type: str
        output_file,  # type: str
        delete=True,  # type: bool
):
    # type: (...) -> None
    """Composes VCF shards to one VCF file.

  It composes VCF data header and VCF data files to one VCF file, and deletes
  the original VCF shards if `delete` is True.
  TODO(allieychen): Eventually, it further consolidates the meta information,
  into the `output_file`.

  Args:
    project: The project name.
    vcf_data_header_file_path: The path of the VCF data header file.
    vcf_data_files_folder: The folder that contains all VCF data files.
    output_file: The final VCF file path.
    delete: If true, delete the original VCF shards.
  """
    vcf_data_bucket_name, vcf_data_blob_prefix = gcsio.parse_gcs_path(
        vcf_data_files_folder)
    vcf_data_header_bucket_name, vcf_data_header_blob_name = gcsio.parse_gcs_path(
        vcf_data_header_file_path)
    if vcf_data_bucket_name != vcf_data_header_bucket_name:
        raise ValueError(
            'The VCF data files {} and data header file {} are in '
            'different buckets. '.format(vcf_data_files_folder,
                                         vcf_data_header_file_path))

    composed_vcf_data_blob = _compose_vcf_data_files(project,
                                                     vcf_data_files_folder)
    client = storage.Client(project)
    bucket = client.get_bucket(vcf_data_bucket_name)
    output_file_blob = _create_blob(client, output_file)
    output_file_blob.compose(
        [bucket.get_blob(vcf_data_header_blob_name), composed_vcf_data_blob])
    if delete:
        bucket.delete_blobs(bucket.list_blobs(prefix=vcf_data_blob_prefix))
        bucket.delete_blobs(
            bucket.list_blobs(prefix=vcf_data_header_blob_name))
Example #10
def compose_gcs_vcf_shards(
        project,  # type: str
        vcf_header_file_path,  # type: str
        vcf_data_files_folder,  # type: str
        output_file,  # type: str
        delete=False,  # type: bool
):
    # type: (...) -> None
    """Composes VCF shards in GCS to one VCF file.

  It composes VCF header and VCF data files to one VCF file, and deletes the
  original VCF shards if `delete` is True.

  Args:
    project: The project name.
    vcf_header_file_path: The path of the VCF header file, it contains the meta
      information, as well as the data header line with the call names.
    vcf_data_files_folder: The folder that contains all VCF data files.
    output_file: The final VCF file path.
    delete: If true, delete the original VCF shards.
  """
    header_bucket_name, header_blob = gcsio.parse_gcs_path(
        vcf_header_file_path)
    vcf_data_bucket_name, vcf_data_blob_prefix = gcsio.parse_gcs_path(
        vcf_data_files_folder)

    if vcf_data_bucket_name != header_bucket_name:
        raise ValueError('The VCF data files {} and header file {} are in '
                         'different buckets. '.format(vcf_data_files_folder,
                                                      vcf_header_file_path))

    composed_vcf_data_blob = _compose_vcf_data_files(project,
                                                     vcf_data_files_folder)
    client = storage.Client(project)
    bucket = client.get_bucket(vcf_data_bucket_name)
    output_file_blob = _create_blob(client, output_file)
    output_file_blob.compose(
        [bucket.get_blob(header_blob), composed_vcf_data_blob])
    if delete:
        bucket.delete_blobs(bucket.list_blobs(prefix=vcf_data_blob_prefix))
        bucket.delete_blobs(bucket.list_blobs(prefix=header_blob))
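A hypothetical invocation of `compose_gcs_vcf_shards`; the project name and GCS paths below are placeholders rather than values from the original pipeline:

compose_gcs_vcf_shards(
    project='my-project',
    vcf_header_file_path='gs://my-bucket/output/header.vcf',
    vcf_data_files_folder='gs://my-bucket/output/shards/',
    output_file='gs://my-bucket/output/merged.vcf',
    delete=False)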
Example #11
 def test_file_close(self):
   file_name = 'gs://gcsio-test/close_file'
   file_size = 5 * 1024 * 1024 + 2000
   contents = os.urandom(file_size)
   f = self.gcs.open(file_name, 'w')
   self.assertEqual(f.mode, 'w')
   f.write(contents)
   f.close()
   f.close()  # This should not crash.
   bucket, name = gcsio.parse_gcs_path(file_name)
   self.assertEqual(
       self.client.objects.get_file(bucket, name).contents, contents)
Example #12
def _compose_vcf_data_files(project, vcf_data_files_folder):
    # type: (str, str) -> storage.Blob
    """Composes multiple VCF data files to one VCF data file.

  Args:
    project: The project name.
    vcf_data_files_folder: The folder that contains all VCF data files.
  """
    bucket_name, blob_prefix = gcsio.parse_gcs_path(vcf_data_files_folder)
    multi_process_composer = MultiProcessComposer(project, bucket_name,
                                                  blob_prefix)
    return multi_process_composer.get_composed_blob()
Example #13
  def test_copy(self):
    src_file_name = 'gs://gcsio-test/source'
    dest_file_name = 'gs://gcsio-test/dest'
    file_size = 1024
    self._insert_random_file(self.client, src_file_name, file_size)
    self.assertTrue(
        gcsio.parse_gcs_path(src_file_name) in self.client.objects.files)
    self.assertFalse(
        gcsio.parse_gcs_path(dest_file_name) in self.client.objects.files)

    self.gcs.copy(src_file_name, dest_file_name, dest_kms_key_name='kms_key')

    self.assertTrue(
        gcsio.parse_gcs_path(src_file_name) in self.client.objects.files)
    self.assertTrue(
        gcsio.parse_gcs_path(dest_file_name) in self.client.objects.files)

    # Test copy of non-existent files.
    with self.assertRaisesRegexp(HttpError, r'Not Found'):
      self.gcs.copy('gs://gcsio-test/non-existent',
                    'gs://gcsio-test/non-existent-destination')
Example #14
 def test_file_write(self):
   file_name = 'gs://gcsio-test/write_file'
   file_size = 5 * 1024 * 1024 + 2000
   contents = os.urandom(file_size)
   f = self.gcs.open(file_name, 'w')
   self.assertEqual(f.mode, 'w')
   f.write(contents[0:1000])
   f.write(contents[1000:1024 * 1024])
   f.write(contents[1024 * 1024:])
   f.close()
   bucket, name = gcsio.parse_gcs_path(file_name)
   self.assertEqual(
       self.client.objects.get_file(bucket, name).contents, contents)
Example #15
  def test_file_read_line(self):
    file_name = 'gs://gcsio-test/read_line_file'
    lines = []

    # Set a small buffer size to exercise refilling the buffer.
    # First line is carefully crafted so the newline falls as the last character
    # of the buffer to exercise this code path.
    read_buffer_size = 1024
    lines.append('x' * 1023 + '\n')

    for _ in range(1, 1000):
      line_length = random.randint(100, 500)
      line = os.urandom(line_length).replace('\n', ' ') + '\n'
      lines.append(line)
    contents = ''.join(lines)

    file_size = len(contents)
    bucket, name = gcsio.parse_gcs_path(file_name)
    self.client.objects.add_file(FakeFile(bucket, name, contents, 1))

    f = self.gcs.open(file_name, read_buffer_size=read_buffer_size)

    # Test read of first two lines.
    f.seek(0)
    self.assertEqual(f.readline(), lines[0])
    self.assertEqual(f.tell(), len(lines[0]))
    self.assertEqual(f.readline(), lines[1])

    # Test read at line boundary.
    f.seek(file_size - len(lines[-1]) - 1)
    self.assertEqual(f.readline(), '\n')

    # Test read at end of file.
    f.seek(file_size)
    self.assertEqual(f.readline(), '')

    # Test reads at random positions.
    random.seed(0)
    for _ in range(0, 10):
      start = random.randint(0, file_size - 1)
      line_index = 0
      # Find line corresponding to start index.
      chars_left = start
      while True:
        next_line_length = len(lines[line_index])
        if chars_left - next_line_length < 0:
          break
        chars_left -= next_line_length
        line_index += 1
      f.seek(start)
      self.assertEqual(f.readline(), lines[line_index][chars_left:])
Example #16
  def test_copytree(self):
    src_dir_name = 'gs://gcsio-test/source/'
    dest_dir_name = 'gs://gcsio-test/dest/'
    file_size = 1024
    paths = ['a', 'b/c', 'b/d']
    for path in paths:
      src_file_name = src_dir_name + path
      dest_file_name = dest_dir_name + path
      self._insert_random_file(self.client, src_file_name, file_size)
      self.assertTrue(
          gcsio.parse_gcs_path(src_file_name) in self.client.objects.files)
      self.assertFalse(
          gcsio.parse_gcs_path(dest_file_name) in self.client.objects.files)

    self.gcs.copytree(src_dir_name, dest_dir_name)

    for path in paths:
      src_file_name = src_dir_name + path
      dest_file_name = dest_dir_name + path
      self.assertTrue(
          gcsio.parse_gcs_path(src_file_name) in self.client.objects.files)
      self.assertTrue(
          gcsio.parse_gcs_path(dest_file_name) in self.client.objects.files)
Example #17
 def test_file_flush(self):
   file_name = 'gs://gcsio-test/flush_file'
   file_size = 5 * 1024 * 1024 + 2000
   contents = os.urandom(file_size)
   bucket, name = gcsio.parse_gcs_path(file_name)
   f = self.gcs.open(file_name, 'w')
   self.assertEqual(f.mode, 'w')
   f.write(contents[0:1000])
   f.flush()
   f.write(contents[1000:1024 * 1024])
   f.flush()
   f.flush()  # Should be a NOOP.
   f.write(contents[1024 * 1024:])
   f.close()  # This should already call the equivalent of flush() in its body.
   self.assertEqual(
       self.client.objects.get_file(bucket, name).contents, contents)
Example #18
 def _insert_random_file(self,
                         client,
                         path,
                         size,
                         generation=1,
                         crc32c=None,
                         last_updated=None,
                         fail_when_getting_metadata=False,
                         fail_when_reading=False):
     bucket, name = gcsio.parse_gcs_path(path)
     f = FakeFile(bucket,
                  name,
                  os.urandom(size),
                  generation,
                  crc32c=crc32c,
                  last_updated=last_updated)
     client.objects.add_file(f, fail_when_getting_metadata,
                             fail_when_reading)
     return f
Example #19
  def test_context_manager(self):
    # Test writing with a context manager.
    file_name = 'gs://gcsio-test/context_manager_file'
    file_size = 1024
    contents = os.urandom(file_size)
    with self.gcs.open(file_name, 'w') as f:
      f.write(contents)
    bucket, name = gcsio.parse_gcs_path(file_name)
    self.assertEqual(
        self.client.objects.get_file(bucket, name).contents, contents)

    # Test reading with a context manager.
    with self.gcs.open(file_name) as f:
      self.assertEqual(f.read(), contents)

    # Test that exceptions are not swallowed by the context manager.
    with self.assertRaises(ZeroDivisionError):
      with self.gcs.open(file_name) as f:
        f.read(0 // 0)
Example #20
  def test_file_iterator(self):
    file_name = 'gs://gcsio-test/iterating_file'
    lines = []
    line_count = 10
    for _ in range(line_count):
      line_length = random.randint(100, 500)
      line = os.urandom(line_length).replace('\n', ' ') + '\n'
      lines.append(line)

    contents = ''.join(lines)
    bucket, name = gcsio.parse_gcs_path(file_name)
    self.client.objects.add_file(FakeFile(bucket, name, contents, 1))

    f = self.gcs.open(file_name)

    read_lines = 0
    for line in f:
      read_lines += 1

    self.assertEqual(read_lines, line_count)
Example #21
 def _insert_random_file(self, client, path, size, generation=1):
   bucket, name = gcsio.parse_gcs_path(path)
   f = FakeFile(bucket, name, os.urandom(size), generation)
   client.objects.add_file(f)
   return f
Example #22
 def _insert_random_file(self, client, path, size, generation=1):
     bucket, name = gcsio.parse_gcs_path(path)
     f = FakeFile(bucket, name, os.urandom(size), generation)
     client.objects.add_file(f)
     return f
Example #23
def _create_blob(client, file_path):
    # type: (storage.Client, str) -> storage.Blob
    bucket_name, blob_name = gcsio.parse_gcs_path(file_path)
    file_blob = client.get_bucket(bucket_name).blob(blob_name)
    file_blob.content_type = 'text/plain'
    return file_blob
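A hypothetical usage of `_create_blob`; the project and path are placeholders. The returned blob does not exist in GCS until something is written to it, for example via `compose()` as in the functions above or via the standard `Blob.upload_from_string` method:

client = storage.Client('my-project')
output_blob = _create_blob(client, 'gs://my-bucket/results/output.vcf')
output_blob.upload_from_string('##fileformat=VCFv4.3\n')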
Example #24
 def test_gcs_path(self):
   self.assertEqual(
       gcsio.parse_gcs_path('gs://bucket/name'), ('bucket', 'name'))
   self.assertEqual(
       gcsio.parse_gcs_path('gs://bucket/name/sub'), ('bucket', 'name/sub'))