def test_gcs_path_object_optional(self):
  """Verifies parse_gcs_path with object_optional=True accepts bucket-only paths."""
  cases = [
      ('gs://bucket/name', ('bucket', 'name')),
      ('gs://bucket/', ('bucket', '')),
  ]
  for gcs_path, expected in cases:
    self.assertEqual(
        gcsio.parse_gcs_path(gcs_path, object_optional=True), expected)
def test_gcs_path_object_optional(self):
  """Checks that a path with and without an object name parses under object_optional.

  NOTE(review): this test name duplicates another definition in this file;
  Python keeps only the last definition, so the earlier one never runs.
  """
  with_object = gcsio.parse_gcs_path('gs://bucket/name', object_optional=True)
  self.assertEqual(with_object, ('bucket', 'name'))
  bucket_only = gcsio.parse_gcs_path('gs://bucket/', object_optional=True)
  self.assertEqual(bucket_only, ('bucket', ''))
def test_copytree(self):
  """Copies a directory tree; sources must remain and destinations must appear."""
  src_dir_name = 'gs://gcsio-test/source/'
  dest_dir_name = 'gs://gcsio-test/dest/'
  file_size = 1024
  paths = ['a', 'b/c', 'b/d']
  for path in paths:
    src_file_name = src_dir_name + path
    dest_file_name = dest_dir_name + path
    self._insert_random_file(self.client, src_file_name, file_size)
    # assertIn/assertNotIn produce clearer failure messages than
    # assertTrue(x in y) / assertFalse(x in y).
    self.assertIn(
        gcsio.parse_gcs_path(src_file_name), self.client.objects.files)
    self.assertNotIn(
        gcsio.parse_gcs_path(dest_file_name), self.client.objects.files)
  self.gcs.copytree(src_dir_name, dest_dir_name)
  for path in paths:
    src_file_name = src_dir_name + path
    dest_file_name = dest_dir_name + path
    # copytree must not delete the originals.
    self.assertIn(
        gcsio.parse_gcs_path(src_file_name), self.client.objects.files)
    self.assertIn(
        gcsio.parse_gcs_path(dest_file_name), self.client.objects.files)
def test_delete(self):
  """Deletes a file and verifies it is gone; deleting a missing file is a no-op."""
  file_name = 'gs://gcsio-test/delete_me'
  file_size = 1024
  # Test deletion of non-existent file: must not raise.
  self.gcs.delete(file_name)
  self._insert_random_file(self.client, file_name, file_size)
  # assertIn/assertNotIn give clearer failure output than assertTrue(x in y).
  self.assertIn(gcsio.parse_gcs_path(file_name), self.client.objects.files)
  self.gcs.delete(file_name)
  self.assertNotIn(gcsio.parse_gcs_path(file_name), self.client.objects.files)
def test_rename(self):
  """Renames a file: source disappears, destination appears."""
  src_file_name = 'gs://gcsio-test/source'
  dest_file_name = 'gs://gcsio-test/dest'
  file_size = 1024
  self._insert_random_file(self.client, src_file_name, file_size)
  # assertIn/assertNotIn give clearer failure output than assertTrue(x in y).
  self.assertIn(
      gcsio.parse_gcs_path(src_file_name), self.client.objects.files)
  self.assertNotIn(
      gcsio.parse_gcs_path(dest_file_name), self.client.objects.files)
  self.gcs.rename(src_file_name, dest_file_name)
  self.assertNotIn(
      gcsio.parse_gcs_path(src_file_name), self.client.objects.files)
  self.assertIn(
      gcsio.parse_gcs_path(dest_file_name), self.client.objects.files)
def _insert_random_file(self, client, path, size, generation=1, crc32c=None,
                        last_updated=None):
  """Registers a FakeFile with random contents under the given GCS path.

  Returns:
    The FakeFile that was added to the fake client.
  """
  bucket_name, object_name = gcsio.parse_gcs_path(path)
  fake_file = FakeFile(
      bucket_name,
      object_name,
      os.urandom(size),
      generation,
      crc32c=crc32c,
      last_updated=last_updated)
  client.objects.add_file(fake_file)
  return fake_file
def test_copy(self):
  """Copies one file; source remains, destination appears; missing source raises."""
  src_file_name = 'gs://gcsio-test/source'
  dest_file_name = 'gs://gcsio-test/dest'
  file_size = 1024
  self._insert_random_file(self.client, src_file_name, file_size)
  # assertIn/assertNotIn produce clearer failure messages than
  # assertTrue(x in y) / assertFalse(x in y).
  self.assertIn(
      gcsio.parse_gcs_path(src_file_name), self.client.objects.files)
  self.assertNotIn(
      gcsio.parse_gcs_path(dest_file_name), self.client.objects.files)
  self.gcs.copy(src_file_name, dest_file_name)
  self.assertIn(
      gcsio.parse_gcs_path(src_file_name), self.client.objects.files)
  self.assertIn(
      gcsio.parse_gcs_path(dest_file_name), self.client.objects.files)
  # Copying a non-existent source must raise IOError.
  self.assertRaises(IOError, self.gcs.copy, 'gs://gcsio-test/non-existent',
                    'gs://gcsio-test/non-existent-destination')
def test_copy(self):
  """Verifies gcs.copy duplicates a file and errors on a missing source.

  NOTE(review): this test name duplicates another definition in this file;
  Python keeps only the last definition, so the earlier one never runs.
  """
  src_file_name = 'gs://gcsio-test/source'
  dest_file_name = 'gs://gcsio-test/dest'
  file_size = 1024
  self._insert_random_file(self.client, src_file_name, file_size)
  # assertIn/assertNotIn give clearer failure output than assertTrue(x in y).
  self.assertIn(
      gcsio.parse_gcs_path(src_file_name), self.client.objects.files)
  self.assertNotIn(
      gcsio.parse_gcs_path(dest_file_name), self.client.objects.files)
  self.gcs.copy(src_file_name, dest_file_name)
  self.assertIn(
      gcsio.parse_gcs_path(src_file_name), self.client.objects.files)
  self.assertIn(
      gcsio.parse_gcs_path(dest_file_name), self.client.objects.files)
  # Copying a non-existent source must raise IOError.
  self.assertRaises(IOError, self.gcs.copy, 'gs://gcsio-test/non-existent',
                    'gs://gcsio-test/non-existent-destination')
def compose_vcf_shards(
    project,  # type: str
    vcf_data_header_file_path,  # type: str
    vcf_data_files_folder,  # type: str
    output_file,  # type: str
    delete=True,  # type: bool
):
  # type: (...) -> None
  """Composes VCF shards to one VCF file.

  The VCF data header and the composed VCF data files are concatenated into a
  single output file. When `delete` is True, the original shards (and the
  header file) are removed afterwards.
  TODO(allieychen): Eventually, it further consolidates the meta information,
  into the `output_file`.

  Args:
    project: The project name.
    vcf_data_header_file_path: The path of the VCF data header file.
    vcf_data_files_folder: The folder that contains all VCF data files.
    output_file: The final VCF file path.
    delete: If true, delete the original VCF shards.

  Raises:
    ValueError: If the header file and data files live in different buckets
      (GCS compose only works within one bucket).
  """
  header_bucket_name, header_blob_name = gcsio.parse_gcs_path(
      vcf_data_header_file_path)
  data_bucket_name, data_blob_prefix = gcsio.parse_gcs_path(
      vcf_data_files_folder)
  if data_bucket_name != header_bucket_name:
    raise ValueError(
        'The VCF data files {} and data header file {} are in '
        'different buckets. '.format(vcf_data_files_folder,
                                     vcf_data_header_file_path))

  composed_data_blob = _compose_vcf_data_files(project, vcf_data_files_folder)
  storage_client = storage.Client(project)
  data_bucket = storage_client.get_bucket(data_bucket_name)
  # Header first, then the composed data, so the output is a valid VCF.
  destination_blob = _create_blob(storage_client, output_file)
  destination_blob.compose(
      [data_bucket.get_blob(header_blob_name), composed_data_blob])
  if delete:
    data_bucket.delete_blobs(data_bucket.list_blobs(prefix=data_blob_prefix))
    data_bucket.delete_blobs(data_bucket.list_blobs(prefix=header_blob_name))
def compose_gcs_vcf_shards(
    project,  # type: str
    vcf_header_file_path,  # type: str
    vcf_data_files_folder,  # type: str
    output_file,  # type: str
    delete=False,  # type: bool
):
  # type: (...) -> None
  """Composes VCF shards in GCS to one VCF file.

  The header file (meta information plus the data header line with call
  names) is concatenated with the composed data shards. The original shards
  are removed only when `delete` is True.

  Args:
    project: The project name.
    vcf_header_file_path: The path of the VCF header file, it contains the
      meta information, as well as the data header line with the call names.
    vcf_data_files_folder: The folder that contains all VCF data files.
    output_file: The final VCF file path.
    delete: If true, delete the original VCF shards.

  Raises:
    ValueError: If the header and data files live in different buckets
      (GCS compose only works within one bucket).
  """
  header_bucket_name, header_blob = gcsio.parse_gcs_path(
      vcf_header_file_path)
  data_bucket_name, data_blob_prefix = gcsio.parse_gcs_path(
      vcf_data_files_folder)
  if data_bucket_name != header_bucket_name:
    raise ValueError('The VCF data files {} and header file {} are in '
                     'different buckets. '.format(vcf_data_files_folder,
                                                  vcf_header_file_path))

  composed_data_blob = _compose_vcf_data_files(project, vcf_data_files_folder)
  storage_client = storage.Client(project)
  data_bucket = storage_client.get_bucket(data_bucket_name)
  # Header first, then the composed data, so the output is a valid VCF.
  destination_blob = _create_blob(storage_client, output_file)
  destination_blob.compose(
      [data_bucket.get_blob(header_blob), composed_data_blob])
  if delete:
    data_bucket.delete_blobs(data_bucket.list_blobs(prefix=data_blob_prefix))
    data_bucket.delete_blobs(data_bucket.list_blobs(prefix=header_blob))
def test_file_close(self):
  """Writing then closing flushes contents; a second close is a no-op."""
  file_name = 'gs://gcsio-test/close_file'
  file_size = 5 * 1024 * 1024 + 2000
  contents = os.urandom(file_size)
  writer = self.gcs.open(file_name, 'w')
  self.assertEqual(writer.mode, 'w')
  writer.write(contents)
  writer.close()
  # Closing an already-closed file must not crash.
  writer.close()
  bucket, name = gcsio.parse_gcs_path(file_name)
  stored = self.client.objects.get_file(bucket, name).contents
  self.assertEqual(stored, contents)
def _compose_vcf_data_files(project, vcf_data_files_folder):
  # type: (str, str) -> storage.Blob
  """Composes multiple VCF data files to one VCF data file.

  Args:
    project: The project name.
    vcf_data_files_folder: The folder that contains all VCF data files.

  Returns:
    The blob holding the composed VCF data.
  """
  bucket_name, blob_prefix = gcsio.parse_gcs_path(vcf_data_files_folder)
  composer = MultiProcessComposer(project, bucket_name, blob_prefix)
  return composer.get_composed_blob()
def test_copy(self):
  """Copies a file with a KMS key; missing source surfaces HttpError."""
  src_file_name = 'gs://gcsio-test/source'
  dest_file_name = 'gs://gcsio-test/dest'
  file_size = 1024
  self._insert_random_file(self.client, src_file_name, file_size)
  # assertIn/assertNotIn produce clearer failure messages than
  # assertTrue(x in y) / assertFalse(x in y).
  self.assertIn(
      gcsio.parse_gcs_path(src_file_name), self.client.objects.files)
  self.assertNotIn(
      gcsio.parse_gcs_path(dest_file_name), self.client.objects.files)
  self.gcs.copy(src_file_name, dest_file_name, dest_kms_key_name='kms_key')
  self.assertIn(
      gcsio.parse_gcs_path(src_file_name), self.client.objects.files)
  self.assertIn(
      gcsio.parse_gcs_path(dest_file_name), self.client.objects.files)
  # Test copy of non-existent files.
  # NOTE(review): assertRaisesRegexp is the deprecated py2 spelling; rename
  # to assertRaisesRegex once the codebase is Python 3 only — confirm the
  # supported Python versions before changing.
  with self.assertRaisesRegexp(HttpError, r'Not Found'):
    self.gcs.copy('gs://gcsio-test/non-existent',
                  'gs://gcsio-test/non-existent-destination')
def test_file_write(self):
  """Multiple sequential writes are concatenated into the stored object."""
  file_name = 'gs://gcsio-test/write_file'
  file_size = 5 * 1024 * 1024 + 2000
  contents = os.urandom(file_size)
  writer = self.gcs.open(file_name, 'w')
  self.assertEqual(writer.mode, 'w')
  # Write in three uneven chunks to exercise buffering across boundaries.
  for start, stop in ((0, 1000), (1000, 1024 * 1024), (1024 * 1024, None)):
    writer.write(contents[start:stop])
  writer.close()
  bucket, name = gcsio.parse_gcs_path(file_name)
  stored = self.client.objects.get_file(bucket, name).contents
  self.assertEqual(stored, contents)
def test_file_read_line(self):
  """Exercises readline() at buffer boundaries, EOF, and random offsets."""
  file_name = 'gs://gcsio-test/read_line_file'
  lines = []
  # Set a small buffer size to exercise refilling the buffer.
  # First line is carefully crafted so the newline falls as the last character
  # of the buffer to exercise this code path.
  read_buffer_size = 1024
  lines.append('x' * 1023 + '\n')
  # Fill out the file with ~1000 random-length lines; embedded newlines are
  # replaced so each appended string contains exactly one trailing '\n'.
  for _ in range(1, 1000):
    line_length = random.randint(100, 500)
    line = os.urandom(line_length).replace('\n', ' ') + '\n'
    lines.append(line)
  contents = ''.join(lines)
  file_size = len(contents)
  bucket, name = gcsio.parse_gcs_path(file_name)
  self.client.objects.add_file(FakeFile(bucket, name, contents, 1))
  f = self.gcs.open(file_name, read_buffer_size=read_buffer_size)
  # Test read of first two lines.
  f.seek(0)
  self.assertEqual(f.readline(), lines[0])
  self.assertEqual(f.tell(), len(lines[0]))
  self.assertEqual(f.readline(), lines[1])
  # Test read at line boundary: seeking to the last line's final '\n'
  # should yield just the newline.
  f.seek(file_size - len(lines[-1]) - 1)
  self.assertEqual(f.readline(), '\n')
  # Test read at end of file.
  f.seek(file_size)
  self.assertEqual(f.readline(), '')
  # Test reads at random positions.
  random.seed(0)
  for _ in range(0, 10):
    start = random.randint(0, file_size - 1)
    line_index = 0
    # Find line corresponding to start index.
    chars_left = start
    while True:
      next_line_length = len(lines[line_index])
      if chars_left - next_line_length < 0:
        break
      chars_left -= next_line_length
      line_index += 1
    # readline() from mid-line should return the rest of that line.
    f.seek(start)
    self.assertEqual(f.readline(), lines[line_index][chars_left:])
def test_copytree(self):
  """Copies a tree of files; originals remain and copies appear.

  NOTE(review): this test name duplicates another definition in this file;
  Python keeps only the last definition, so the earlier one never runs.
  """
  src_dir_name = 'gs://gcsio-test/source/'
  dest_dir_name = 'gs://gcsio-test/dest/'
  file_size = 1024
  paths = ['a', 'b/c', 'b/d']
  for path in paths:
    src_file_name = src_dir_name + path
    dest_file_name = dest_dir_name + path
    self._insert_random_file(self.client, src_file_name, file_size)
    # assertIn/assertNotIn give clearer failure output than assertTrue(x in y).
    self.assertIn(
        gcsio.parse_gcs_path(src_file_name), self.client.objects.files)
    self.assertNotIn(
        gcsio.parse_gcs_path(dest_file_name), self.client.objects.files)
  self.gcs.copytree(src_dir_name, dest_dir_name)
  for path in paths:
    src_file_name = src_dir_name + path
    dest_file_name = dest_dir_name + path
    self.assertIn(
        gcsio.parse_gcs_path(src_file_name), self.client.objects.files)
    self.assertIn(
        gcsio.parse_gcs_path(dest_file_name), self.client.objects.files)
def test_file_flush(self):
  """flush() between writes (including back-to-back) must not corrupt data."""
  file_name = 'gs://gcsio-test/flush_file'
  file_size = 5 * 1024 * 1024 + 2000
  contents = os.urandom(file_size)
  bucket, name = gcsio.parse_gcs_path(file_name)
  writer = self.gcs.open(file_name, 'w')
  self.assertEqual(writer.mode, 'w')
  writer.write(contents[0:1000])
  writer.flush()
  writer.write(contents[1000:1024 * 1024])
  writer.flush()
  # A second flush with nothing buffered should be a NOOP.
  writer.flush()
  writer.write(contents[1024 * 1024:])
  # close() should already call the equivalent of flush() in its body.
  writer.close()
  stored = self.client.objects.get_file(bucket, name).contents
  self.assertEqual(stored, contents)
def _insert_random_file(self, client, path, size, generation=1, crc32c=None,
                        last_updated=None, fail_when_getting_metadata=False,
                        fail_when_reading=False):
  """Registers a FakeFile with random contents, optionally set to fail.

  The fail_when_* flags make the fake client raise when metadata or contents
  are requested, letting tests exercise error paths.

  Returns:
    The FakeFile that was added to the fake client.
  """
  bucket_name, object_name = gcsio.parse_gcs_path(path)
  fake_file = FakeFile(
      bucket_name,
      object_name,
      os.urandom(size),
      generation,
      crc32c=crc32c,
      last_updated=last_updated)
  client.objects.add_file(fake_file, fail_when_getting_metadata,
                          fail_when_reading)
  return fake_file
def test_context_manager(self):
  """open() works as a context manager for writing, reading, and exceptions."""
  file_name = 'gs://gcsio-test/context_manager_file'
  file_size = 1024
  contents = os.urandom(file_size)
  # Test writing with a context manager.
  with self.gcs.open(file_name, 'w') as writer:
    writer.write(contents)
  bucket, name = gcsio.parse_gcs_path(file_name)
  stored = self.client.objects.get_file(bucket, name).contents
  self.assertEqual(stored, contents)
  # Test reading with a context manager.
  with self.gcs.open(file_name) as reader:
    self.assertEqual(reader.read(), contents)
  # Test that exceptions are not swallowed by the context manager.
  with self.assertRaises(ZeroDivisionError):
    with self.gcs.open(file_name) as reader:
      reader.read(0 // 0)
def test_file_iterator(self):
  """Iterating an open file yields exactly one item per line."""
  file_name = 'gs://gcsio-test/iterating_file'
  line_count = 10
  lines = []
  for _ in range(line_count):
    line_length = random.randint(100, 500)
    # Strip embedded newlines so each entry is exactly one line.
    lines.append(os.urandom(line_length).replace('\n', ' ') + '\n')
  contents = ''.join(lines)
  bucket, name = gcsio.parse_gcs_path(file_name)
  self.client.objects.add_file(FakeFile(bucket, name, contents, 1))
  reader = self.gcs.open(file_name)
  read_lines = sum(1 for _ in reader)
  self.assertEqual(read_lines, line_count)
def _insert_random_file(self, client, path, size, generation=1):
  """Registers a FakeFile with `size` random bytes under the given GCS path.

  Returns:
    The FakeFile that was added to the fake client.
  """
  bucket_name, object_name = gcsio.parse_gcs_path(path)
  fake_file = FakeFile(bucket_name, object_name, os.urandom(size), generation)
  client.objects.add_file(fake_file)
  return fake_file
def _insert_random_file(self, client, path, size, generation=1):
  """Adds a randomly-filled FakeFile for `path` to the fake client and returns it."""
  bucket, name = gcsio.parse_gcs_path(path)
  new_file = FakeFile(bucket, name, os.urandom(size), generation)
  client.objects.add_file(new_file)
  return new_file
def _create_blob(client, file_path):
  # type: (storage.Client, str) -> storage.Blob
  """Returns a new text/plain Blob handle for the given GCS path.

  The blob is not uploaded here; only the local handle is created.
  """
  bucket_name, blob_name = gcsio.parse_gcs_path(file_path)
  bucket = client.get_bucket(bucket_name)
  blob = bucket.blob(blob_name)
  blob.content_type = 'text/plain'
  return blob
def test_gcs_path(self):
  """parse_gcs_path splits bucket and object, keeping slashes in the object."""
  cases = [
      ('gs://bucket/name', ('bucket', 'name')),
      ('gs://bucket/name/sub', ('bucket', 'name/sub')),
  ]
  for gcs_path, expected in cases:
    self.assertEqual(gcsio.parse_gcs_path(gcs_path), expected)