def http_restore(pghoard):
    pgdata = get_pg_wal_directory(pghoard.config["backup_sites"][pghoard.test_site])
    return HTTPRestore("localhost", pghoard.config["http_port"], site=pghoard.test_site, pgdata=pgdata)
def test_archive_command_with_invalid_file(self, pghoard):
    # only WAL and timeline (.history) files can be archived
    bl_label = "000000010000000000000002.00000028.backup"
    bl_file = "xlog/{}".format(bl_label)
    wal_path = os.path.join(get_pg_wal_directory(pghoard.config["backup_sites"][pghoard.test_site]), bl_label)
    backup_wal_path = os.path.join(pghoard.config["backup_location"], pghoard.test_site, bl_file)
    with open(wal_path, "w") as fp:
        fp.write("jee")
    # backup labels are ignored - archiving returns success but file won't appear on disk
    archive_command(host="127.0.0.1", port=pghoard.config["http_port"], site=pghoard.test_site, xlog=bl_label)
    assert not os.path.exists(backup_wal_path)
    # any other files raise an error
    with pytest.raises(postgres_command.PGCError) as excinfo:
        archive_command(host="127.0.0.1", port=pghoard.config["http_port"], site=pghoard.test_site, xlog=bl_label + ".x")
    assert excinfo.value.exit_code == postgres_command.EXIT_ARCHIVE_FAIL
    assert not os.path.exists(backup_wal_path + ".x")
def test_get_encrypted_archived_file(self, pghoard):
    wal_seg = "000000090000000000000010"
    content = wal_header_for_file(wal_seg)
    compressor = pghoard.Compressor()
    compressed_content = compressor.compress(content) + (compressor.flush() or b"")
    encryptor = Encryptor(CONSTANT_TEST_RSA_PUBLIC_KEY)
    encrypted_content = encryptor.update(compressed_content) + encryptor.finalize()
    wal_dir = get_pg_wal_directory(pghoard.config["backup_sites"][pghoard.test_site])
    archive_path = os.path.join(pghoard.test_site, "xlog", wal_seg)
    metadata = {
        "compression-algorithm": pghoard.config["compression"]["algorithm"],
        "original-file-size": len(content),
        "encryption-key-id": "testkey",
    }
    store = pghoard.transfer_agents[0].get_object_storage(pghoard.test_site)
    store.store_file_from_memory(archive_path, encrypted_content, metadata=metadata)
    pghoard.webserver.config["backup_sites"][pghoard.test_site]["encryption_keys"] = {
        "testkey": {
            "public": CONSTANT_TEST_RSA_PUBLIC_KEY,
            "private": CONSTANT_TEST_RSA_PRIVATE_KEY,
        },
    }
    restore_target = os.path.join(wal_dir, wal_seg)
    restore_command(site=pghoard.test_site, xlog=wal_seg, output=restore_target,
                    host="127.0.0.1", port=pghoard.config["http_port"])
    assert os.path.exists(restore_target)
    with open(restore_target, "rb") as fp:
        restored_data = fp.read()
    assert content == restored_data
def check_and_upload_missing_local_files(self):
    current_wal_file = self.get_current_wal_file()
    first_required_wal_file, _ = self.get_first_required_wal_segment()

    # Find relevant WAL files. We do this by checking archival status
    # of all WAL files older than the one currently open (ie reverse
    # sorted list from newest file that should've been archived to the
    # oldest on disk) and appending missing files to a list. After
    # collecting a list we start archiving them from oldest to newest.
    # This is done so we don't break our missing archive detection logic
    # if sync is interrupted for some reason.

    # Sort all timeline files first to make sure they're always
    # archived, otherwise the timeline files are processed only after
    # all WAL files for a given timeline have been handled.
    wal_dir = get_pg_wal_directory(self.backup_site)
    wal_files = os.listdir(wal_dir)
    wal_files.sort(key=lambda f: (f.endswith(".history"), f), reverse=True)
    need_archival = []
    for wal_file in wal_files:
        archive_type = None
        if wal.TIMELINE_RE.match(wal_file):
            # We want all timeline files
            archive_type = "TIMELINE"
        elif not wal.WAL_RE.match(wal_file):
            pass  # not a WAL or timeline file
        elif wal_file == current_wal_file:
            self.log.info("Skipping currently open WAL file %r", wal_file)
        elif wal_file > current_wal_file:
            self.log.debug("Skipping recycled WAL file %r", wal_file)
        elif first_required_wal_file is not None and wal_file < first_required_wal_file:
            self.log.info("WAL file %r is not needed for the latest basebackup", wal_file)
            break
        else:
            # WAL file in range first_required_wal_file .. current_wal_file
            archive_type = "WAL"

        if archive_type:
            resp = requests.head("{base}/archive/{file}".format(base=self.base_url, file=wal_file))
            if resp.status_code == 200:
                self.log.info("%s file %r already archived", archive_type, wal_file)
                continue
            self.log.info("%s file %r needs to be archived", archive_type, wal_file)
            need_archival.append(wal_file)

    for wal_file in sorted(need_archival):  # sort oldest to newest
        resp = requests.put("{base}/archive/{file}".format(base=self.base_url, file=wal_file))
        archive_type = "TIMELINE" if ".history" in wal_file else "WAL"
        if resp.status_code != 201:
            self.log.error("%s file %r archival failed with status code %r", archive_type, wal_file, resp.status_code)
        else:
            self.log.info("%s file %r archived", archive_type, wal_file)
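# Illustrative sketch, not part of pghoard: shows how the sort key used in
# check_and_upload_missing_local_files above orders files, i.e. timeline (.history)
# files first and everything else newest-to-oldest, so interrupted syncs don't confuse
# the missing-archive detection. The helper name and file names are made up for this example.
def _example_wal_sort_order():
    wal_files = [
        "000000020000000000000003",
        "00000002.history",
        "000000020000000000000001",
        "000000020000000000000002",
    ]
    wal_files.sort(key=lambda f: (f.endswith(".history"), f), reverse=True)
    # result: ['00000002.history', '000000020000000000000003',
    #          '000000020000000000000002', '000000020000000000000001']
    return wal_files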
def test_get_archived_file(self, pghoard):
    wal_seg_prev_tli = "00000001000000000000000F"
    wal_seg = "00000002000000000000000F"
    wal_file = "xlog/{}".format(wal_seg)
    # NOTE: create WAL header for the "previous" timeline, this should be accepted
    content = wal_header_for_file(wal_seg_prev_tli)
    wal_dir = get_pg_wal_directory(pghoard.config["backup_sites"][pghoard.test_site])
    archive_path = os.path.join(pghoard.test_site, wal_file)
    compressor = pghoard.Compressor()
    compressed_content = compressor.compress(content) + (compressor.flush() or b"")
    metadata = {
        "compression-algorithm": pghoard.config["compression"]["algorithm"],
        "original-file-size": len(content),
    }
    store = pghoard.transfer_agents[0].get_object_storage(pghoard.test_site)
    store.store_file_from_memory(archive_path, compressed_content, metadata=metadata)

    restore_command(site=pghoard.test_site, xlog=wal_seg, output=None,
                    host="127.0.0.1", port=pghoard.config["http_port"])
    restore_target = os.path.join(wal_dir, wal_seg)
    restore_command(site=pghoard.test_site, xlog=wal_seg, output=restore_target,
                    host="127.0.0.1", port=pghoard.config["http_port"])
    assert os.path.exists(restore_target) is True
    with open(restore_target, "rb") as fp:
        restored_data = fp.read()
    assert content == restored_data

    # test the same thing using restore as 'pghoard_postgres_command'
    tmp_out = os.path.join(wal_dir, restore_target + ".cmd")
    postgres_command.main([
        "--host", "localhost",
        "--port", str(pghoard.config["http_port"]),
        "--site", pghoard.test_site,
        "--mode", "restore",
        "--output", tmp_out,
        "--xlog", wal_seg,
    ])
    with open(tmp_out, "rb") as fp:
        restored_data = fp.read()
    assert content == restored_data
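# Illustrative sketch, not part of the test suite: the same 'pghoard_postgres_command'
# entry point exercised above is what a PostgreSQL recovery.conf restore_command would
# normally invoke. The site name, host and port below are hypothetical placeholders;
# %p and %f are expanded by PostgreSQL itself.
EXAMPLE_RESTORE_COMMAND = (
    "pghoard_postgres_command --mode restore --site example-site "
    "--host 127.0.0.1 --port 16000 --output %p --xlog %f"
)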
def _create_fetch_operation(self, key, site, filetype, obname, max_age=-1, suppress_error=True):
    with self.server.lock:
        # Don't fetch again if we already have a pending fetch operation, unless the operation
        # has been ongoing longer than the given max age and has potentially become stale
        existing = self.server.pending_download_ops.get(key)
        if existing and (max_age < 0 or time.monotonic() - existing["started_at"] <= max_age):
            return

    xlog_dir = get_pg_wal_directory(self.server.config["backup_sites"][site])
    prefetch_target_path = os.path.join(xlog_dir, "{}.pghoard.prefetch".format(obname))
    if os.path.exists(prefetch_target_path):
        return

    try:
        fd, tmp_target_path = tempfile.mkstemp(prefix="{}/{}.".format(xlog_dir, obname), suffix=".pghoard.tmp")
        os.close(fd)
    except OSError as ex:
        self.server.log.error("Unable to create temporary file to fetch %r: %s: %s",
                              obname, ex.__class__.__name__, ex)
        if suppress_error:
            return
        else:
            raise HttpResponse("Unable to create temporary file for {0!r}: {1.__class__.__name__}: {1}".format(key, ex),
                               status=400)

    self.server.log.debug("Fetching site: %r, filename: %r, filetype: %r, tmp_target_path: %r",
                          site, obname, filetype, tmp_target_path)
    target_path = os.path.join(xlog_dir, "{}.pghoard.prefetch".format(obname))
    self.server.pending_download_ops[key] = dict(
        started_at=time.monotonic(),
        target_path=target_path,
    )
    self.server.transfer_queue.put({
        "callback_queue": self.server.download_results,
        "filetype": filetype,
        "local_path": obname,
        "opaque": key,
        "site": site,
        "target_path": tmp_target_path,
        "type": "DOWNLOAD",
    })
def test_wal_fetch_optimization(self, pghoard):
    # inject fake WAL and timeline files for testing
    invalid_wal_name = "000000060000000000000001"
    valid_wal_name = "000000060000000000000002"
    wal_name_output = "optimization_output_filename"
    output_path = os.path.join(
        get_pg_wal_directory(pghoard.config["backup_sites"][pghoard.test_site]), wal_name_output)
    invalid_wal_path = os.path.join(
        get_pg_wal_directory(pghoard.config["backup_sites"][pghoard.test_site]), invalid_wal_name)
    valid_wal_path = os.path.join(
        get_pg_wal_directory(pghoard.config["backup_sites"][pghoard.test_site]), valid_wal_name)
    with open(valid_wal_path, "wb") as out_file:
        out_file.write(wal_header_for_file(os.path.basename(valid_wal_path)))
    with open(invalid_wal_path, "wb") as out_file:
        # We use the wrong WAL file's name to generate the header on purpose to see that our check works
        out_file.write(wal_header_for_file(os.path.basename(valid_wal_path)))

    restore_command(
        site=pghoard.test_site, xlog=os.path.basename(valid_wal_name),
        host="127.0.0.1", port=pghoard.config["http_port"],
        output=output_path, retry_interval=0.1)
    assert os.path.exists(output_path)
    os.unlink(output_path)

    with pytest.raises(postgres_command.PGCError):
        restore_command(
            site=pghoard.test_site, xlog=os.path.basename(invalid_wal_name),
            host="127.0.0.1", port=pghoard.config["http_port"],
            output=output_path, retry_interval=0.1)
    assert not os.path.exists(output_path)
    os.unlink(invalid_wal_path)
def handle_archival_request(self, site, filename, filetype):
    if filetype == "basebackup":
        # Request a basebackup to be made for site
        self.server.log.debug("Requesting a new basebackup for site: %r to be made", site)
        self.server.requested_basebackup_sites.add(site)
        raise HttpResponse(status=201)

    start_time = time.time()
    site_config = self.server.config["backup_sites"][site]
    xlog_dir = get_pg_wal_directory(site_config)
    xlog_path = os.path.join(xlog_dir, filename)
    self.server.log.debug("Got request to archive: %r %r %r, %r", site, filetype, filename, xlog_path)
    if not os.path.exists(xlog_path):
        self.server.log.debug("xlog_path: %r did not exist, cannot archive, returning 404", xlog_path)
        raise HttpResponse("N/A", status=404)

    self._verify_wal(filetype, filename, xlog_path)

    callback_queue = Queue()
    if not self.server.config["backup_sites"][site]["object_storage"]:
        compress_to_memory = False
    else:
        compress_to_memory = True
    compression_event = {
        "type": "MOVE",
        "callback_queue": callback_queue,
        "compress_to_memory": compress_to_memory,
        "delete_file_after_compression": False,
        "full_path": xlog_path,
        "site": site,
        "src_path": "{}.partial".format(xlog_path),
    }
    self.server.compression_queue.put(compression_event)
    try:
        response = callback_queue.get(timeout=30)
        self.server.log.debug("Handled an archival request for: %r %r, took: %.3fs",
                              site, xlog_path, time.time() - start_time)
    except Empty:
        self.server.log.exception("Problem in getting a response in time, returning 500, took: %.2fs",
                                  time.time() - start_time)
        raise HttpResponse("TIMEOUT", status=500)

    if not response["success"]:
        raise HttpResponse(status=500)
    raise HttpResponse(status=201)
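# Illustrative client-side sketch, not part of pghoard: handle_archival_request above serves
# PUT {base}/archive/{file}, and a HEAD on the same URL reports whether the file is already
# archived, which mirrors what ArchiveSync.check_and_upload_missing_local_files does earlier
# in this file. The helper name and the default base_url (host/port/site) are hypothetical.
import requests

def _example_archive_via_http(wal_file, base_url="http://127.0.0.1:16000/example-site"):
    # HEAD returns 200 when the file is already in the archive
    if requests.head("{base}/archive/{file}".format(base=base_url, file=wal_file)).status_code == 200:
        return "already archived"
    # PUT asks pghoard to archive the local file; 201 signals success
    resp = requests.put("{base}/archive/{file}".format(base=base_url, file=wal_file))
    return "archived" if resp.status_code == 201 else "failed: {}".format(resp.status_code)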
def test_archiving(self, pghoard):
    store = pghoard.transfer_agents[0].get_object_storage(pghoard.test_site)
    # inject fake WAL and timeline files for testing
    for xlog_type, wal_name in [("xlog", "0000000000000000000000CC"), ("timeline", "0000000F.history")]:
        foo_path = os.path.join(get_pg_wal_directory(pghoard.config["backup_sites"][pghoard.test_site]), wal_name)
        with open(foo_path, "wb") as out_file:
            if xlog_type == "xlog":
                out_file.write(wal_header_for_file(wal_name))
            else:
                out_file.write(b"1 2 3\n")
        archive_command(host="localhost", port=pghoard.config["http_port"], site=pghoard.test_site, xlog=wal_name)
        archive_path = os.path.join(pghoard.test_site, xlog_type, wal_name)
        store.get_metadata_for_key(archive_path)
        store.delete_key(archive_path)
        os.unlink(foo_path)
def _create_prefetch_operations(self, site, filetype, filename):
    if filetype not in {"timeline", "xlog"}:
        return
    prefetch_n = self.server.config["restore_prefetch"]
    if prefetch_n <= 0:
        return

    xlog_dir = get_pg_wal_directory(self.server.config["backup_sites"][site])
    names = []
    if filetype == "timeline":
        tli_num = int(filename.replace(".history", ""), 16)
        for _ in range(prefetch_n):
            tli_num += 1
            prefetch_name = "{:08X}.history".format(tli_num)
            if os.path.isfile(os.path.join(xlog_dir, prefetch_name)):
                continue
            names.append(prefetch_name)
    elif filetype == "xlog":
        xlog_num = int(filename, 16)
        for _ in range(prefetch_n):
            if xlog_num & 0xFF == 0xFF:
                xlog_num += 0xFFFFFF00
            xlog_num += 1
            prefetch_name = "{:024X}".format(xlog_num)
            xlog_path = os.path.join(xlog_dir, prefetch_name)
            if os.path.isfile(xlog_path):
                try:
                    wal.verify_wal(wal_name=prefetch_name, filepath=xlog_path)
                    continue
                except ValueError as e:
                    self.server.log.debug("(Prefetch) File %s already exists but is invalid: %r", xlog_path, e)
            names.append(prefetch_name)

    for obname in names:
        key = self._make_file_key(site, filetype, obname)
        if key in self.server.prefetch_404:
            continue  # previously failed to prefetch this file, don't try again
        self._create_fetch_operation(key, site, filetype, obname)
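# Illustrative sketch, not part of pghoard: the WAL-name arithmetic used in
# _create_prefetch_operations above treats the 24-hex-digit name as one integer and,
# when the low byte of the segment number is 0xFF, bumps the counter past the
# 0xFFFFFF00 gap so the next name rolls over to segment 00000000 of the next log file.
# The helper name and the sample starting segment are made up for this example.
def _example_next_wal_names(filename, count):
    xlog_num = int(filename, 16)
    names = []
    for _ in range(count):
        if xlog_num & 0xFF == 0xFF:
            xlog_num += 0xFFFFFF00
        xlog_num += 1
        names.append("{:024X}".format(xlog_num))
    return names

# _example_next_wal_names("0000000100000000000000FE", 3)
# -> ['0000000100000000000000FF', '000000010000000100000000', '000000010000000100000001']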
def test_archive_sync(self, db, pghoard):
    log = logging.getLogger("test_archive_sync")
    store = pghoard.transfer_agents[0].get_object_storage(pghoard.test_site)

    def list_archive(folder):
        if folder == "timeline":
            matcher = wal.TIMELINE_RE.match
        else:
            matcher = wal.WAL_RE.match

        path_to_list = "{}/{}".format(pghoard.test_site, folder)
        files_found, files_total = 0, 0
        for obj in store.list_path(path_to_list):
            fname = os.path.basename(obj["name"])
            files_total += 1
            if matcher(fname):
                files_found += 1
                yield fname

        log.info("Listed %r, %r out of %r matched %r pattern", path_to_list, files_found, files_total, folder)

    # create a basebackup to start with
    self._run_and_wait_basebackup(pghoard, db, "pipe")

    # force a couple of wal segment switches
    start_wal, _ = self._switch_wal(db, 4)
    # we should have at least 4 WAL files now (there may be more in
    # case other tests created them -- we share a single postgresql
    # cluster between all tests)
    pg_wal_dir = get_pg_wal_directory(pghoard.config["backup_sites"][pghoard.test_site])
    pg_wals = {f for f in os.listdir(pg_wal_dir) if wal.WAL_RE.match(f) and f > start_wal}
    assert len(pg_wals) >= 4

    # create a couple of "recycled" xlog files that we must ignore
    last_wal = sorted(pg_wals)[-1]
    dummy_data = b"x" * (16 * 2 ** 20)

    def write_dummy_wal(inc):
        filename = "{:024X}".format((int(last_wal, 16) + inc))
        print("Writing dummy WAL file", filename)
        open(os.path.join(pg_wal_dir, filename), "wb").write(dummy_data)
        return filename

    recycled1 = write_dummy_wal(1)
    recycled2 = write_dummy_wal(2)

    # check what we have archived, there should be at least the three
    # above WALs that are NOT there at the moment
    archived_wals = set(list_archive("xlog"))
    assert len(pg_wals - archived_wals) >= 4
    # now perform an archive sync
    arsy = ArchiveSync()
    arsy.run(["--site", pghoard.test_site, "--config", pghoard.config_path])
    # and now archive should include all our WALs
    archived_wals = set(list_archive("xlog"))
    # the recycled files must not appear in archived files
    assert recycled1 not in archived_wals
    assert recycled2 not in archived_wals
    # the regular wals must be archived
    assert archived_wals.issuperset(pg_wals)

    # if we delete a wal file that's not the latest archival it should
    # get synced to the archive as we don't have a basebackup newer than
    # it
    current_wal = arsy.get_current_wal_file()
    old_wals = sorted(wal for wal in pg_wals if wal < current_wal)
    store.delete_key(os.path.join(pghoard.test_site, "xlog", old_wals[-2]))
    arsy.run(["--site", pghoard.test_site, "--config", pghoard.config_path])
    archived_wals = set(list_archive("xlog"))
    assert archived_wals.issuperset(pg_wals)
    # delete the topmost wal file, this should cause resync too
    store.delete_key(os.path.join(pghoard.test_site, "xlog", old_wals[-1]))
    arsy.run(["--site", pghoard.test_site, "--config", pghoard.config_path])
    archived_wals = set(list_archive("xlog"))
    assert archived_wals.issuperset(pg_wals)

    # let's do a little dance to turn our DB into a standby and then
    # promote it, forcing a timeline switch
    db.kill(force=False)
    with open(os.path.join(db.pgdata, "recovery.conf"), "w") as fp:
        fp.write(
            "standby_mode = 'on'\n"
            "recovery_target_timeline = 'latest'\n"
            "restore_command = 'false'\n"
        )
    # start PG and promote it
    db.run_pg()
    db.run_cmd("pg_ctl", "-D", db.pgdata, "promote")
    time.sleep(5)  # TODO: instead of sleeping, poll the db until ready

    # we should have a single timeline file in pg_xlog/pg_wal now
    pg_wal_timelines = {f for f in os.listdir(pg_wal_dir) if wal.TIMELINE_RE.match(f)}
    assert len(pg_wal_timelines) > 0
    # but there should be nothing archived as archive_command wasn't setup
    archived_timelines = set(list_archive("timeline"))
    assert len(archived_timelines) == 0
    # let's hit archive sync
    arsy.run(["--site", pghoard.test_site, "--config", pghoard.config_path])
    # now we should have an archived timeline
    archived_timelines = set(list_archive("timeline"))
    assert archived_timelines.issuperset(pg_wal_timelines)
    assert "00000002.history" in archived_timelines

    # let's take a new basebackup
    self._run_and_wait_basebackup(pghoard, db, "basic")
    # nuke archives and resync them
    for name in list_archive(folder="timeline"):
        store.delete_key(os.path.join(pghoard.test_site, "timeline", name))
    for name in list_archive(folder="xlog"):
        store.delete_key(os.path.join(pghoard.test_site, "xlog", name))
    self._switch_wal(db, 1)

    arsy.run(["--site", pghoard.test_site, "--config", pghoard.config_path])

    archived_wals = set(list_archive("xlog"))
    # assume the same timeline file as before and one to three wal files
    assert len(archived_wals) >= 1
    assert len(archived_wals) <= 3
    archived_timelines = set(list_archive("timeline"))
    assert list(archived_timelines) == ["00000002.history"]
def get_wal_or_timeline_file(self, site, filename, filetype):
    target_path = self.headers.get("x-pghoard-target-path")
    if not target_path:
        raise HttpResponse("x-pghoard-target-path header missing from download", status=400)

    # See if we have already prefetched the file
    site_config = self.server.config["backup_sites"][site]
    xlog_dir = get_pg_wal_directory(site_config)
    prefetch_target_path = os.path.join(xlog_dir, "{}.pghoard.prefetch".format(filename))
    if os.path.exists(prefetch_target_path):
        self._save_and_verify_restored_file(filetype, filename, prefetch_target_path, target_path)
        raise HttpResponse(status=201)

    # After reaching a recovery_target and restart of a PG server, PG wants to replay and refetch
    # files from the archive starting from the latest checkpoint. We have potentially fetched these files
    # already earlier. Check if we have the files already and if we do, don't go over the network to refetch
    # them yet again but just rename them to the path that PG is requesting.
    xlog_path = os.path.join(xlog_dir, filename)
    if os.path.exists(xlog_path):
        self.server.log.info("Requested %r, found it in pg_xlog directory as: %r, returning directly",
                             filename, xlog_path)
        success = False
        try:
            self._save_and_verify_restored_file(filetype, filename, xlog_path, target_path)
            success = True
        except (ValueError, HttpResponse) as ex:
            self.server.log.warning("Found file: %r but it was invalid: %s", xlog_path, ex)
        if success:
            raise HttpResponse(status=201)

    prefetch_n = self.server.config["restore_prefetch"]
    prefetch = []
    if filetype == "timeline":
        tli_num = int(filename.replace(".history", ""), 16)
        for _ in range(prefetch_n):
            tli_num += 1
            prefetch.append("{:08X}.history".format(tli_num))
    elif filetype == "xlog":
        xlog_num = int(filename, 16)
        for _ in range(prefetch_n):
            if xlog_num & 0xFF == 0xFF:
                xlog_num += 0xFFFFFF00
            xlog_num += 1
            prefetch.append("{:024X}".format(xlog_num))

    with self._prefetch(site, filetype, prefetch):
        self._transfer_agent_op(site, filename, filetype, "DOWNLOAD", target_path=target_path)

    raise HttpResponse(status=201)
@contextmanager
def _prefetch(self, site, filetype, names):
    if not names:
        return

    start_time = time.monotonic()
    callback_queue = Queue()
    site_config = self.server.config["backup_sites"][site]
    xlog_dir = get_pg_wal_directory(site_config)
    downloads = {}
    for obname in names:
        if obname in self.server.prefetch_404:
            continue  # previously failed to prefetch this file, don't try again
        prefetch_target_path = os.path.join(xlog_dir, "{}.pghoard.prefetch".format(obname))
        if os.path.exists(prefetch_target_path):
            continue  # already fetched this file
        try:
            fd, tmp_target_path = tempfile.mkstemp(prefix="{}/{}.".format(xlog_dir, obname), suffix=".pghoard.tmp")
            os.close(fd)
        except OSError as ex:
            self.server.log.error("Unable to create temporary file to prefetch %r: %s: %s",
                                  obname, ex.__class__.__name__, ex)
            continue
        self.server.log.debug("Prefetching site: %r, filename: %r, filetype: %r, tmp_target_path: %r",
                              site, obname, filetype, tmp_target_path)
        downloads[obname] = tmp_target_path
        self.server.transfer_queue.put({
            "callback_queue": callback_queue,
            "filetype": filetype,
            "local_path": obname,
            "opaque": obname,
            "site": site,
            "target_path": tmp_target_path,
            "type": "DOWNLOAD",
        })

    # allow something else to happen
    try:
        yield
    finally:
        # process results (timeout is 30 seconds after start but at least 5 seconds)
        timeout_at = max(start_time + 30, time.monotonic() + 5)
        while downloads:
            time_left = timeout_at - time.monotonic()
            try:
                response = callback_queue.get(timeout=time_left)
            except Empty:
                break  # timeout
            obname = response["opaque"]
            tmp_target_path = downloads.pop(response["opaque"])
            if response["success"]:
                prefetch_target_path = os.path.join(xlog_dir, "{}.pghoard.prefetch".format(obname))
                os.rename(tmp_target_path, prefetch_target_path)
                self.server.log.debug("Prefetched %r %r to %r, took: %.3fs",
                                      site, obname, prefetch_target_path, time.monotonic() - start_time)
            else:
                ex = response.get("exception", Error)
                if isinstance(ex, FileNotFoundFromStorageError):
                    # don't try prefetching this file again
                    self.server.prefetch_404.append(obname)
                self.server.log.debug("Prefetching %r %r failed (%s), took: %.3fs",
                                      site, obname, ex.__class__.__name__, time.monotonic() - start_time)
                with suppress(Exception):
                    os.unlink(tmp_target_path)

        # everything else timed out
        while downloads:
            obname, tmp_target_path = downloads.popitem()
            self.server.log.debug("Prefetching %r %r timed out, took: %.3fs",
                                  site, obname, time.monotonic() - start_time)
            with suppress(Exception):
                os.unlink(tmp_target_path)
def get_wal_or_timeline_file(self, site, filename, filetype):
    target_path = self.headers.get("x-pghoard-target-path")
    if not target_path:
        raise HttpResponse("x-pghoard-target-path header missing from download", status=400)

    self._process_completed_download_operations()

    # See if we have already prefetched the file
    site_config = self.server.config["backup_sites"][site]
    xlog_dir = get_pg_wal_directory(site_config)
    prefetch_target_path = os.path.join(xlog_dir, "{}.pghoard.prefetch".format(filename))
    if os.path.exists(prefetch_target_path):
        ex = self._try_save_and_verify_restored_file(filetype, filename, prefetch_target_path, target_path)
        if not ex:
            self._create_prefetch_operations(site, filetype, filename)
            raise HttpResponse(status=201)

    # After reaching a recovery_target and restart of a PG server, PG wants to replay and refetch
    # files from the archive starting from the latest checkpoint. We have potentially fetched these files
    # already earlier. Check if we have the files already and if we do, don't go over the network to refetch
    # them yet again but just rename them to the path that PG is requesting.
    xlog_path = os.path.join(xlog_dir, filename)
    if os.path.exists(xlog_path):
        self.server.log.info("Requested %r, found it in pg_xlog directory as: %r, returning directly",
                             filename, xlog_path)
        ex = self._try_save_and_verify_restored_file(filetype, filename, xlog_path, target_path, unlink=False)
        if ex:
            self.server.log.warning("Found file: %r but it was invalid: %s", xlog_path, ex)
        else:
            raise HttpResponse(status=201)

    key = self._make_file_key(site, filetype, filename)
    with suppress(ValueError):
        self.server.prefetch_404.remove(key)

    self._create_fetch_operation(key, site, filetype, filename, max_age=5, suppress_error=False)
    self._create_prefetch_operations(site, filetype, filename)

    last_schedule_call = time.monotonic()
    start_time = time.monotonic()
    retries = 2
    while (time.monotonic() - start_time) <= 30:
        self._process_completed_download_operations(timeout=0.01)
        with self.server.lock:
            if os.path.isfile(prefetch_target_path):
                ex = self._try_save_and_verify_restored_file(filetype, filename, prefetch_target_path, target_path)
                if not ex:
                    raise HttpResponse(status=201)
                elif ex and retries == 0:
                    raise ex  # pylint: disable=raising-bad-type
                retries -= 1
        if key in self.server.prefetch_404:
            raise HttpResponse(status=404)
        with self.server.lock:
            if key not in self.server.pending_download_ops:
                if retries == 0:
                    raise HttpResponse(status=500)
                retries -= 1
                self._create_fetch_operation(key, site, filetype, filename, suppress_error=False)
        if time.monotonic() - last_schedule_call >= 1:
            last_schedule_call = time.monotonic()
            # Replace existing download operation if it has been executing for too long
            self._create_fetch_operation(key, site, filetype, filename, max_age=10, suppress_error=False)

    raise HttpResponse("TIMEOUT", status=500)
def check_and_upload_missing_local_files(self, max_hash_checks):
    current_wal_file = self.get_current_wal_file()
    first_required_wal_file, _ = self.get_first_required_wal_segment()

    # Find relevant WAL files. We do this by checking archival status
    # of all WAL files older than the one currently open (ie reverse
    # sorted list from newest file that should've been archived to the
    # oldest on disk) and appending missing files to a list. After
    # collecting a list we start archiving them from oldest to newest.
    # This is done so we don't break our missing archive detection logic
    # if sync is interrupted for some reason.

    # Sort all timeline files first to make sure they're always
    # archived, otherwise the timeline files are processed only after
    # all WAL files for a given timeline have been handled.
    wal_dir = get_pg_wal_directory(self.backup_site)
    wal_files = os.listdir(wal_dir)
    wal_files.sort(key=lambda f: (f.endswith(".history"), f), reverse=True)
    need_archival = []
    hash_checks_done = 0
    existing_wal_without_checksum_count = 0
    for wal_file in wal_files:
        archive_type = None
        if wal.TIMELINE_RE.match(wal_file):
            # We want all timeline files
            archive_type = "TIMELINE"
        elif not wal.WAL_RE.match(wal_file):
            pass  # not a WAL or timeline file
        elif wal_file == current_wal_file:
            self.log.info("Skipping currently open WAL file %r", wal_file)
        elif wal_file > current_wal_file:
            self.log.debug("Skipping recycled WAL file %r", wal_file)
        elif first_required_wal_file is not None and wal_file < first_required_wal_file:
            self.log.info("WAL file %r is not needed for the latest basebackup", wal_file)
            break
        else:
            # WAL file in range first_required_wal_file .. current_wal_file
            archive_type = "WAL"

        if archive_type:
            resp = requests.head("{base}/archive/{file}".format(base=self.base_url, file=wal_file))
            if resp.status_code == 200:
                remote_hash = resp.headers.get("metadata-hash")
                hash_algorithm = resp.headers.get("metadata-hash-algorithm")
                check_hash = bool(
                    archive_type == "WAL" and (hash_checks_done < max_hash_checks or max_hash_checks < 0) and remote_hash
                )
                if archive_type == "WAL" and not remote_hash:
                    # If we don't have hashes available (old pghoard was running on the previous master), re-upload the
                    # first file that already exists in remote storage without a checksum, since it might be the last
                    # WAL file of the previous timeline uploaded by the old master and invalid: it doesn't have the
                    # timeline switch event and may have some writes that are not valid for our timeline
                    existing_wal_without_checksum_count += 1
                    if existing_wal_without_checksum_count == 1:
                        self.log.info(
                            "%s file %r already archived but no hash is available, reuploading", archive_type, wal_file
                        )
                        need_archival.append(wal_file)
                        continue
                if check_hash:
                    hash_checks_done += 1
                    our_hash = self.calculate_hash(os.path.join(wal_dir, wal_file), hash_algorithm)
                    if not our_hash:
                        self.log.info(
                            "%s file %r already archived (file deleted before getting hash)", archive_type, wal_file
                        )
                    elif remote_hash.lower().strip() != our_hash.lower().strip():
                        self.log.warning(
                            "%s file %r already archived but existing hash %r differs from our hash %r, reuploading",
                            archive_type, wal_file, remote_hash, our_hash
                        )
                        need_archival.append(wal_file)
                    else:
                        self.log.info("%s file %r already archived and has valid hash", archive_type, wal_file)
                else:
                    self.log.info("%s file %r already archived", archive_type, wal_file)
                continue
            self.log.info("%s file %r needs to be archived", archive_type, wal_file)
            need_archival.append(wal_file)

    for wal_file in sorted(need_archival):  # sort oldest to newest
        resp = requests.put("{base}/archive/{file}".format(base=self.base_url, file=wal_file))
        archive_type = "TIMELINE" if ".history" in wal_file else "WAL"
        if resp.status_code != 201:
            self.log.error("%s file %r archival failed with status code %r", archive_type, wal_file, resp.status_code)
        else:
            self.log.info("%s file %r archived", archive_type, wal_file)