def test_dynamically_emptied_directories(tmpdir):
    """Verify directories emptied mid-backup are still created.

    PostgreSQL can empty the files inside a directory in parallel with
    the backup: after the files have been partitioned into their
    tarballs but before the tar-and-upload step completes.  The
    directory entry itself must still appear in a partition so it gets
    recreated on restore.
    """
    # Build the fixture tree ./adir/bdir/afile with a small payload.
    outer = tmpdir.join('adir').ensure(dir=True)
    inner = outer.join('bdir').ensure(dir=True)
    inner.join('afile').write('1234567890')

    # Partition the tree exactly as the backup path would.
    root = outer.strpath
    spec, parts = tar_partition.partition(root)

    # Collect every submitted member path, relative to the root.
    members = [
        os.path.relpath(info.submitted_path, root)
        for part in parts
        for info in part
    ]

    # "bdir" itself must be a partition member so it is created even if
    # postgres removes "afile" during the tarring process.
    assert 'bdir' in members
def test_dynamically_emptied_directories(tmpdir): ''' Ensure we create directories in the base backup even when PostgreSQL empties the files in those directories. This emptying can happen after we partition the files into their tarballs but before the tar and upload process is complete. ''' # Create a directory structure with a file in it: # adir/bdir/afile adir = tmpdir.join('adir').ensure(dir=True) bdir = adir.join('bdir').ensure(dir=True) some_file = bdir.join('afile') some_file.write('1234567890') # Generate the partition for the tar files base_dir = adir.strpath spec, parts = tar_partition.partition(base_dir) tar_paths = [] for part in parts: for tar_info in part: rel_path = os.path.relpath(tar_info.submitted_path, base_dir) tar_paths.append(rel_path) # Ensure we include the bdir directory in the partition so we'll still # create bdir even if postgres removes afile during the tar process assert 'bdir' in tar_paths
def _upload_pg_cluster_dir(self, start_backup_info, pg_cluster_dir,
                           version, pool_size, rate_limit=None):
    """
    Upload to url_prefix from pg_cluster_dir

    This function ignores the directory pg_xlog, which contains WAL
    files that are not generally part of a base backup.

    Note that this also lzo compresses the files: thus, the number of
    pooled processes involves doing a full sequential scan of the
    uncompressed Postgres heap file that is pipelined into lzo. Once
    lzo is completely finished (necessary to have access to the file
    size) the file is sent to S3 or WABS.

    TODO: Investigate an optimization to decouple the compression and
    upload steps to make sure that the most efficient possible use of
    pipelining of network and disk resources occurs.  Right now it is
    possible to bounce back and forth between bottlenecking on reading
    from the database block device and subsequently the S3/WABS
    sending steps should the processes be at the same stage of the
    upload pipeline: this can have a very negative impact on being
    able to make full use of system resources.

    Furthermore, it is desirable to avoid overflowing the page cache:
    having separate tunables for number of simultaneous compression
    jobs (which occupy /tmp space and page cache) and number of
    uploads (which affect upload throughput) would help.
    """
    spec, parts = tar_partition.partition(pg_cluster_dir)

    # TODO :: Move arbitrary path construction to StorageLayout Object
    backup_prefix = '{0}/basebackups_{1}/base_{file_name}_{file_offset}'\
        .format(self.layout.prefix.rstrip('/'), FILE_STRUCTURE_VERSION,
                **start_backup_info)

    if rate_limit is None:
        per_process_limit = None
    else:
        per_process_limit = int(rate_limit / pool_size)

    # Reject tiny per-process rate limits.  They should be rejected
    # more nicely elsewhere.  The None (unlimited) check must come
    # first so it short-circuits: evaluating "None > 0" raises
    # TypeError on Python 3.
    assert per_process_limit is None or per_process_limit > 0

    total_size = 0

    # Make an attempt to upload extended version metadata
    extended_version_url = backup_prefix + '/extended_version.txt'
    logger.info(
        msg='start upload postgres version metadata',
        detail=('Uploading to {extended_version_url}.'
                .format(extended_version_url=extended_version_url)))
    # NOTE(review): the sibling variant of this method passes
    # content_type= here; confirm which keyword this uri_put_file
    # signature expects.
    uri_put_file(self.creds, extended_version_url, StringIO(version),
                 content_encoding='text/plain')
    logger.info(msg='postgres version metadata upload complete')

    uploader = PartitionUploader(self.creds, backup_prefix,
                                 per_process_limit, self.gpg_key_id)

    pool = TarUploadPool(uploader, pool_size)

    # Enqueue uploads for parallel execution
    for tpart in parts:
        total_size += tpart.total_member_size

        # 'put' can raise an exception for a just-failed upload,
        # aborting the process.
        pool.put(tpart)

    # Wait for remaining parts to upload.  An exception can be
    # raised to signal failure of the upload.
    pool.join()

    return spec, backup_prefix, total_size
def _upload_pg_cluster_dir(self, start_backup_info, pg_cluster_dir,
                           version, pool_size, rate_limit=None):
    """
    Compress and upload everything under pg_cluster_dir.

    The directory pg_xlog is skipped: it holds WAL files, which are
    not generally part of a base backup.

    Each pooled worker performs a full sequential scan of an
    uncompressed Postgres heap file, pipelined through lzo; only once
    compression completes (so the final size is known) is the file
    sent to S3 or WABS.

    TODO: decouple the compression and upload steps so pipelining of
    network and disk resources is used as efficiently as possible.
    When all workers reach the same stage of the upload pipeline at
    once, the system bounces between bottlenecking on database
    block-device reads and on the S3/WABS sends, which can badly hurt
    overall resource utilization.  Separate tunables for the number
    of simultaneous compression jobs (which occupy /tmp space and
    page cache) and the number of uploads (which affect upload
    throughput) would also help avoid overflowing the page cache.
    """
    spec, parts = tar_partition.partition(pg_cluster_dir)

    # TODO :: Move arbitrary path construction to StorageLayout Object
    backup_prefix = '{0}/basebackups_{1}/base_{file_name}_{file_offset}'.format(
        self.layout.prefix.rstrip('/'), FILE_STRUCTURE_VERSION,
        **start_backup_info)

    if rate_limit is None:
        per_process_limit = None
    else:
        per_process_limit = int(rate_limit / pool_size)

    # Tiny per-process rate limits are nonsensical; they should have
    # been rejected more gracefully before reaching this point.
    assert per_process_limit is None or per_process_limit > 0

    # Best-effort upload of the extended version metadata first.
    extended_version_url = backup_prefix + '/extended_version.txt'
    logger.info(msg='start upload postgres version metadata',
                detail=('Uploading to {extended_version_url}.'.format(
                    extended_version_url=extended_version_url)))
    uri_put_file(self.creds, extended_version_url,
                 BytesIO(version.encode("utf8")),
                 content_type='text/plain')
    logger.info(msg='postgres version metadata upload complete')

    uploader = PartitionUploader(self.creds, backup_prefix,
                                 per_process_limit, self.gpg_key_id)
    pool = TarUploadPool(uploader, pool_size)

    # Queue every partition for parallel upload, tallying member
    # sizes as we go.  'put' may raise for a just-failed upload,
    # aborting the whole process.
    total_size = 0
    for partition in parts:
        total_size += partition.total_member_size
        pool.put(partition)

    # Block until every outstanding part finishes; a failed upload
    # surfaces here as an exception.
    pool.join()

    return spec, backup_prefix, total_size
def _s3_upload_pg_cluster_dir(self, start_backup_info, pg_cluster_dir,
                              version, pool_size, rate_limit=None):
    """
    Upload to s3_url_prefix from pg_cluster_dir

    This function ignores the directory pg_xlog, which contains WAL
    files that are not generally part of a base backup.

    Note that this also lzo compresses the files: thus, the number of
    pooled processes involves doing a full sequential scan of the
    uncompressed Postgres heap file that is pipelined into lzo. Once
    lzo is completely finished (necessary to have access to the file
    size) the file is sent to S3.

    TODO: Investigate an optimization to decouple the compression and
    upload steps to make sure that the most efficient possible use of
    pipelining of network and disk resources occurs.  Right now it is
    possible to bounce back and forth between bottlenecking on reading
    from the database block device and subsequently the S3 sending
    steps should the processes be at the same stage of the upload
    pipeline: this can have a very negative impact on being able to
    make full use of system resources.

    Furthermore, it is desirable to avoid overflowing the page cache:
    having separate tunables for number of simultaneous compression
    jobs (which occupy /tmp space and page cache) and number of
    uploads (which affect upload throughput) would help.
    """
    parts = tar_partition.partition(pg_cluster_dir)

    backup_s3_prefix = ('{0}/basebackups_{1}/'
                        'base_{file_name}_{file_offset}'
                        .format(self.s3_prefix, FILE_STRUCTURE_VERSION,
                                **start_backup_info))

    if rate_limit is None:
        per_process_limit = None
    else:
        per_process_limit = int(rate_limit / pool_size)

    # Reject tiny per-process rate limits.  They should be rejected
    # more nicely elsewhere.  The None (unlimited) check must come
    # first so it short-circuits: evaluating "None > 0" raises
    # TypeError on Python 3.
    assert per_process_limit is None or per_process_limit > 0

    # a list to accumulate async upload jobs
    uploads = []

    total_size = 0

    # Make an attempt to upload extended version metadata
    extended_version_url = backup_s3_prefix + '/extended_version.txt'
    logger.info(
        msg='start upload postgres version metadata',
        detail=('Uploading to {extended_version_url}.'
                .format(extended_version_url=extended_version_url)))
    s3_worker.uri_put_file(extended_version_url, StringIO(version),
                           content_encoding='text/plain')
    logger.info(msg='postgres version metadata upload complete')

    pool = gevent.pool.Pool(size=pool_size)

    # Enqueue uploads for parallel execution
    try:
        for tpart in parts:
            total_size += tpart.total_member_size
            uploads.append(pool.apply_async(
                s3_worker.do_partition_put,
                [backup_s3_prefix, tpart, per_process_limit,
                 self.gpg_key_id]))
    finally:
        # Drain every async result even on error so upload failures
        # propagate and no greenlet is left unwaited.
        while uploads:
            uploads.pop().get()

        pool.join()

    return backup_s3_prefix, total_size