def wait_for_replicas(self, checkpoint_lsn):
    """Wait up to 60 seconds for every replica to replay WAL up to ``checkpoint_lsn``.

    Each replica cursor in ``self.replica_connections`` is polled for its last
    replayed position (as a byte offset from '0/0').  Replicas that already
    reached ``checkpoint_lsn`` are not queried again.

    :param checkpoint_lsn: integer position every replica has to reach.
    :returns: ``True`` when all replicas caught up; otherwise the lagging
              nodes are logged and the function falls through (``None``).
    """
    from patroni.utils import polling_loop

    logger.info('Waiting for replica nodes to catch up with primary')

    query = ("SELECT pg_catalog.pg_{0}_{1}_diff(pg_catalog.pg_last_{0}_replay_{1}(),"
             " '0/0')::bigint").format(self.postgresql.wal_name, self.postgresql.lsn_name)

    status = {}

    for _ in polling_loop(60):
        synced = True
        for name, (_unused, cur) in self.replica_connections.items():
            prev = status.get(name)
            if prev and prev >= checkpoint_lsn:
                continue

            cur.execute(query)
            lsn = cur.fetchone()[0]
            status[name] = lsn

            # The query result may be NULL (None); comparing None with an int
            # raises TypeError, so treat it as "not caught up yet".
            if lsn is None or lsn < checkpoint_lsn:
                synced = False

        if synced:
            logger.info('All replicas are ready')
            return True

    for name in self.replica_connections.keys():
        lsn = status.get(name)
        if not lsn or lsn < checkpoint_lsn:
            # Without a known position the lag cannot be computed.
            lag = 'unknown' if lsn is None else checkpoint_lsn - lsn
            logger.error('Node %s did not catched up. Lag=%s', name, lag)
def main():
    """Upgrade the local cluster to the version of the installed binaries.

    Returns 0 immediately when the cluster version already equals the binary
    version.  Otherwise the old cluster is started, post-init is run, initdb
    settings are collected, incompatible objects are dropped, the cluster is
    stopped, pg_upgrade is executed and the new cluster is started and
    analyzed.  Every failed step raises ``Exception``.
    """
    from patroni.config import Config
    from patroni.utils import polling_loop
    from pg_upgrade import PostgresqlUpgrade

    config = Config()
    pg = PostgresqlUpgrade(config['postgresql'])

    def stop_old_cluster():
        # The very same stop call is used on bail-out paths and for the final shutdown.
        return pg.stop(block_callbacks=True, checkpoint=False)

    def scalar(sql):
        # First column of the first row returned for ``sql``.
        return pg.query(sql).fetchone()[0]

    target_version = pg.get_binary_version()
    current_version = pg.get_cluster_version()
    if current_version == target_version:
        return 0

    logger.info('Cluster version: %s, bin version: %s', current_version, target_version)
    assert float(current_version) < float(target_version)

    # One week: effectively "wait as long as it takes".
    pg.config['pg_ctl_timeout'] = 3600*24*7

    logger.info('Trying to start the cluster with old postgres')
    started = pg.start_old_cluster(config['bootstrap'], current_version)
    if not started:
        raise Exception('Failed to start the cluster with old postgres')

    for _attempt in polling_loop(pg.config['pg_ctl_timeout'], 10):
        pg.reset_cluster_info_state()
        if not pg.is_leader():
            logger.info('waiting for end of recovery of the old cluster')
            continue
        break

    if not pg.run_bootstrap_post_init(config['bootstrap']):
        stop_old_cluster()
        raise Exception('Failed to run bootstrap.post_init')

    initdb_config = [{'locale': scalar('SHOW lc_collate')},
                     {'encoding': scalar('SHOW server_encoding')}]
    if scalar("SELECT current_setting('data_checksums')::bool"):
        initdb_config.append('data-checksums')

    logger.info('Dropping objects from the cluster which could be incompatible')
    try:
        pg.drop_possibly_incompatible_objects()
    except Exception:
        stop_old_cluster()
        raise

    logger.info('Doing a clean shutdown of the cluster before pg_upgrade')
    if not stop_old_cluster():
        raise Exception('Failed to stop the cluster with old postgres')

    logger.info('initdb config: %s', initdb_config)

    logger.info('Executing pg_upgrade')
    upgraded = pg.do_upgrade(target_version, {'initdb': initdb_config})
    if not upgraded:
        raise Exception('Failed to upgrade cluster from {0} to {1}'.format(current_version, target_version))

    logger.info('Starting the cluster with new postgres after upgrade')
    if not pg.start():
        raise Exception('Failed to start the cluster with new postgres')
    pg.analyze()
def wait_until_pause_is_applied(dcs, paused, old_cluster):
    """Block until the members of ``old_cluster`` report the requested pause state.

    Polls the DCS for ``loop_wait + 1`` seconds.  On timeout, members that
    updated their key since ``old_cluster`` was read but still report the
    wrong state are listed; if there are none the outcome is reported as
    success.

    :returns: whatever ``click.echo`` returns for the final message.
    """
    action = 'pause' if paused else 'resume'
    click.echo("'{0}' request sent, waiting until it is recognized by all nodes".format(action))

    # Only members exposing a REST API are tracked; remember the index their key had.
    seen_index = {}
    for m in old_cluster.members:
        if m.api_url:
            seen_index[m.name] = m.index
    loop_wait = old_cluster.config.data.get('loop_wait', dcs.loop_wait)

    for _tick in polling_loop(loop_wait + 1):
        cluster = dcs.get_cluster()
        acknowledged = all(m.data.get('pause', False) == paused
                           for m in cluster.members if m.name in seen_index)
        if acknowledged:
            break
    else:
        remaining = [m.name for m in cluster.members
                     if m.data.get('pause', False) != paused
                     and m.name in seen_index and seen_index[m.name] != m.index]
        if remaining:
            message = "{0} members didn't recognized pause state after {1} seconds".format(
                ', '.join(remaining), loop_wait)
            return click.echo(message)

    outcome = 'paused' if paused else 'resumed'
    return click.echo('Success: cluster management is {0}'.format(outcome))
def main():
    """Upgrade the local cluster to the version of the installed binaries.

    The configuration is loaded from the path in ``sys.argv[1]``.  Returns 0
    immediately when the cluster version already equals the binary version;
    any failed step raises ``Exception``.  A failure to update extensions
    after the upgrade is only logged.
    """
    from pg_upgrade import PostgresqlUpgrade
    from patroni.config import Config
    from patroni.utils import polling_loop
    from spilo_commons import get_binary_version

    config = Config(sys.argv[1])
    pg = PostgresqlUpgrade(config)

    def stop_old_cluster():
        # The very same stop call is used on bail-out paths and for the final shutdown.
        return pg.stop(block_callbacks=True, checkpoint=False)

    target_version = get_binary_version(pg.pgcommand(''))
    current_version = pg.get_cluster_version()
    if current_version == target_version:
        return 0

    logger.info('Cluster version: %s, bin version: %s', current_version, target_version)
    assert float(current_version) < float(target_version)

    logger.info('Trying to start the cluster with old postgres')
    started = pg.start_old_cluster(config['bootstrap'], current_version)
    if not started:
        raise Exception('Failed to start the cluster with old postgres')

    for _attempt in polling_loop(pg.config.get('pg_ctl_timeout'), 10):
        pg.reset_cluster_info_state()
        if not pg.is_leader():
            logger.info('waiting for end of recovery of the old cluster')
            continue
        break

    if not pg.bootstrap.call_post_bootstrap(config['bootstrap']):
        stop_old_cluster()
        raise Exception('Failed to run bootstrap.post_init')

    if not pg.prepare_new_pgdata(target_version):
        raise Exception('initdb failed')

    try:
        pg.drop_possibly_incompatible_objects()
    except Exception:
        stop_old_cluster()
        raise

    logger.info('Doing a clean shutdown of the cluster before pg_upgrade')
    if not stop_old_cluster():
        raise Exception('Failed to stop the cluster with old postgres')

    if not pg.do_upgrade():
        failure = 'Failed to upgrade cluster from {0} to {1}'.format(current_version, target_version)
        raise Exception(failure)

    logger.info('Starting the cluster with new postgres after upgrade')
    if not pg.start():
        raise Exception('Failed to start the cluster with new postgres')

    # Extension updates are best effort: report and carry on to analyze.
    try:
        pg.update_extensions()
    except Exception as exc:
        logger.error('Failed to update extensions: %r', exc)

    pg.analyze()
def wait_end_of_recovery(postgresql):
    """Poll every 10 seconds until ``postgresql`` reports itself as leader.

    Gives up silently once the ``pg_ctl_timeout`` taken from
    ``postgresql.config`` has elapsed.
    """
    from patroni.utils import polling_loop

    timeout = postgresql.config.get('pg_ctl_timeout')
    for _attempt in polling_loop(timeout, 10):
        # Drop cached state so that is_leader() looks at fresh information.
        postgresql.reset_cluster_info_state()
        if not postgresql.is_leader():
            logger.info('waiting for end of recovery of the old cluster')
            continue
        break
def main():
    """Upgrade the local cluster to the version of the installed binaries.

    Returns 0 immediately when the cluster version already equals the binary
    version.  Otherwise the old cluster is bootstrapped with a no-op bootstrap
    command, post-init is run, initdb settings (locale, encoding, checksums)
    are collected from the running cluster, the cluster is stopped,
    pg_upgrade is executed and the new cluster is started and analyzed.
    Every failed step raises ``Exception``.
    """
    from patroni.config import Config
    from patroni.utils import polling_loop
    from pg_upgrade import PostgresqlUpgrade

    config = Config()
    upgrade = PostgresqlUpgrade(config['postgresql'])

    bin_version = upgrade.get_binary_version()
    cluster_version = upgrade.get_cluster_version()

    if cluster_version == bin_version:
        return 0

    logger.info('Cluster version: %s, bin version: %s', cluster_version, bin_version)
    assert float(cluster_version) < float(bin_version)

    upgrade.set_bin_dir(cluster_version)
    # One week: effectively "wait as long as it takes".
    upgrade.config['pg_ctl_timeout'] = 3600 * 24 * 7
    upgrade.config['callbacks'] = {}

    bootstrap_config = config['bootstrap']
    # Replace the configured bootstrap command with the no-op `true`.
    bootstrap_config[bootstrap_config['method']]['command'] = 'true'
    logger.info('Trying to start the cluster with old postgres')
    if not upgrade.bootstrap(bootstrap_config):
        raise Exception('Failed to start the cluster with old postgres')

    for _ in polling_loop(upgrade.config['pg_ctl_timeout'], 10):
        upgrade.reset_cluster_info_state()
        if upgrade.is_leader():
            break
        logger.info('waiting for end of recovery of the old cluster')

    if not upgrade.run_bootstrap_post_init(bootstrap_config):
        raise Exception('Failed to run bootstrap.post_init')

    locale = upgrade.query('SHOW lc_collate').fetchone()[0]
    encoding = upgrade.query('SHOW server_encoding').fetchone()[0]
    initdb_config = [{'locale': locale}, {'encoding': encoding}]
    # `SHOW data_checksums` yields the text 'on'/'off', and both are truthy in
    # Python; cast to bool on the server so the flag reflects the real setting.
    if upgrade.query("SELECT current_setting('data_checksums')::bool").fetchone()[0]:
        initdb_config.append('data-checksums')

    logger.info('Doing a clean shutdown of the cluster before pg_upgrade')
    if not upgrade.stop(block_callbacks=True, checkpoint=False):
        raise Exception('Failed to stop the cluster with old postgres')

    logger.info('initdb config: %s', initdb_config)

    logger.info('Executing pg_upgrade')
    if not upgrade.do_upgrade(bin_version, {'initdb': initdb_config}):
        raise Exception('Failed to upgrade cluster from {0} to {1}'.format(
            cluster_version, bin_version))

    logger.info('Starting the cluster with new postgres after upgrade')
    if not upgrade.start():
        raise Exception('Failed to start the cluster with new postgres')
    upgrade.analyze()
def remove_initialize_key(self):
    """Try for up to 10 seconds to remove the initialize key from the DCS.

    :returns: ``True`` when the key is already absent or was removed;
              otherwise an error is logged and the function falls through
              (``None``).
    """
    from patroni.utils import polling_loop

    for _ in polling_loop(10):
        cluster = self.dcs.get_cluster()
        if cluster.initialize is None:
            return True
        # Use the module logger like the rest of this block, not the root-level helper.
        logger.info('Removing initialize key')
        if self.dcs.cancel_initialization():
            return True
    logger.error('Failed to remove initialize key')
def cancel(self):
    """Flag the task as cancelled and stop the tracked process.

    The process is asked to terminate; if it is still running after the
    10 second polling window, ``self._kill_process()`` is called.
    """
    def process_gone():
        # True when there is no tracked process or it is no longer running.
        return self._process is None or not self._process.is_running()

    with self._lock:
        self._is_cancelled = True
        if process_gone():
            return
        self._process.terminate()

    for _tick in polling_loop(10):
        with self._lock:
            if process_gone():
                return

    self._kill_process()
def cancel(self):
    """Flag the task as cancelled and stop the tracked process.

    The process is asked to terminate; if it has no return code after the
    10 second polling window, it is killed.
    """
    def process_finished():
        # True when there is no tracked process or it already has a return code.
        return self._process is None or self._process.returncode is not None

    with self._lock:
        self._is_cancelled = True
        if process_finished():
            return
        self._process.terminate()

    for _tick in polling_loop(10):
        with self._lock:
            if process_finished():
                return

    with self._lock:
        if not process_finished():
            self._process.kill()
def wait_replica_restart(self, member):
    """Wait up to 10 seconds for ``member`` to report a new system identifier.

    :returns: ``member.name`` once its status response carries a
              ``database_system_identifier`` different from
              ``self._old_sysid``; otherwise an error is logged and the
              function falls through (``None``).
    """
    from patroni.utils import polling_loop

    for _attempt in polling_loop(10):
        try:
            reply = self.request(member, timeout=2, retries=0)
            if reply.status != 200:
                continue
            payload = json.loads(reply.data.decode('utf-8'))
            sysid = payload.get('database_system_identifier')
            if sysid and sysid != self._old_sysid:
                return member.name
        except Exception:
            # Any failure counts as "not restarted yet"; retry on the next tick.
            pass
    logger.error('Patroni on replica %s was not restarted in 10 seconds', member.name)
def wait_until_pause_is_applied(dcs, paused, old_cluster):
    """Wait for the members known from ``old_cluster`` to adopt the pause state.

    The DCS is polled for ``loop_wait + 1`` seconds.  If the wait times out,
    members whose key changed since ``old_cluster`` was read but that still
    report the wrong state are named in the message; with no such members
    the result is reported as success.

    :returns: the return value of the final ``click.echo`` call.
    """
    def pause_flag(member):
        # Pause state as published by the member; absent means not paused.
        return member.data.get('pause', False)

    request_name = 'pause' if paused else 'resume'
    click.echo("'{0}' request sent, waiting until it is recognized by all nodes".format(request_name))

    old = {m.name: m.index for m in old_cluster.members if m.api_url}
    loop_wait = old_cluster.config.data.get('loop_wait', dcs.loop_wait)

    for _ in polling_loop(loop_wait + 1):
        cluster = dcs.get_cluster()
        if all(pause_flag(m) == paused for m in cluster.members if m.name in old):
            break
    else:
        remaining = []
        for m in cluster.members:
            if pause_flag(m) != paused and m.name in old and old[m.name] != m.index:
                remaining.append(m.name)
        if remaining:
            return click.echo("{0} members didn't recognized pause state after {1} seconds"
                              .format(', '.join(remaining), loop_wait))

    state_name = 'paused' if paused else 'resumed'
    return click.echo('Success: cluster management is {0}'.format(state_name))
def cancel(self, kill=False):
    """Flag the task as cancelled and stop the tracked process.

    The process is asked to terminate and then polled for up to 10 seconds.

    :param kill: when true, the wait is cut short after the first check and
                 ``self._kill_process()`` is called right away.
    """
    def process_gone():
        # True when there is no tracked process or it is no longer running.
        return self._process is None or not self._process.is_running()

    with self._lock:
        self._is_cancelled = True
        if process_gone():
            return

        logger.info('Terminating %s', self._process_cmd)
        self._process.terminate()

    for _tick in polling_loop(10):
        with self._lock:
            if process_gone():
                return
        if kill:
            break

    self._kill_process()
def while_not_sync_standby(self, func):
    """Run ``func`` after trying to get this node released from the synchronous standby role.

    While the action runs the node is marked as not eligible for synchronous
    replication (``_disable_sync`` is incremented).  After publishing the
    member state we wait up to two ``loop_wait`` periods for the cluster to
    stop listing us as sync standby.  This is only a hint: when the member
    update or the DCS read fails, the action is executed anyway.

    :param func: callable invoked without arguments.
    :returns: whatever ``func`` returns.
    """
    if not self.is_synchronous_mode() or self.patroni.nosync:
        return func()

    with self._member_state_lock:
        self._disable_sync += 1
    try:
        if not self.touch_member():
            logger.warning("Updating member state failed, skipping synchronous standby disable")
        else:
            # Master should notice the updated value during the next cycle. We wait double that;
            # if it hasn't noticed by then, not disabling sync replication is unlikely to matter.
            for _ in polling_loop(timeout=self.dcs.loop_wait * 2, interval=2):
                try:
                    still_sync = self.is_sync_standby(self.dcs.get_cluster())
                except DCSError:
                    logger.warning("Could not get cluster state, skipping synchronous standby disable")
                    break
                if not still_sync:
                    break
                logger.info("Waiting for master to release us from synchronous standby")

        return func()
    finally:
        with self._member_state_lock:
            self._disable_sync -= 1
def wait_for_port_open(self, postmaster, timeout):
    """Wait until PostgreSQL starts answering on its port.

    :param postmaster: process handle checked with ``is_running()`` on each tick.
    :param timeout: number of seconds to keep polling.
    :returns: ``True`` as soon as ``pg_isready`` reports anything but "no
              response"; ``False`` when cancelled, when the postmaster is not
              running (state is set to 'start failed') or on timeout.
    """
    for _tick in polling_loop(timeout):
        if self.cancellable.is_cancelled:
            return False

        if not postmaster.is_running():
            logger.error('postmaster is not running')
            self.set_state('start failed')
            return False

        readiness = self.pg_isready()
        if readiness == STATE_NO_RESPONSE:
            continue
        if readiness not in (STATE_REJECT, STATE_RUNNING):
            logger.warning("Can't determine PostgreSQL startup status, assuming running")
        return True

    logger.warning("Timed out waiting for PostgreSQL to start")
    return False
def toggle_pause(self, paused):
    """Switch the cluster pause flag and wait until members acknowledge it.

    The flag is written into the cluster config in the DCS, then members are
    polled for ``ttl + 1`` seconds.

    :param paused: desired pause state.
    :returns: ``True`` when all tracked members report the requested state.
              In every other case the result of ``logger.error`` is returned,
              or the function falls through without a value.
    """
    from patroni.utils import polling_loop

    cluster = self.dcs.get_cluster()
    new_config = cluster.config.data.copy()
    if cluster.is_paused() == paused:
        negation = '' if paused else 'not '
        return logger.error('Cluster is %spaused, can not continue', negation)

    new_config['pause'] = paused
    serialized = json.dumps(new_config, separators=(',', ':'))
    if not self.dcs.set_config_value(serialized, cluster.config.index):
        return logger.error('Failed to pause cluster, can not continue')

    self.paused = paused

    # Only members exposing a REST API are tracked; remember the index their key had.
    seen_index = {}
    for m in cluster.members:
        if m.api_url:
            seen_index[m.name] = m.index
    ttl = cluster.config.data.get('ttl', self.dcs.ttl)

    for _tick in polling_loop(ttl + 1):
        cluster = self.dcs.get_cluster()
        if all(m.data.get('pause', False) == paused for m in cluster.members if m.name in seen_index):
            mode = 'enabled' if paused else 'disabled'
            logger.info('Maintenance mode %s', mode)
            return True

    remaining = [m.name for m in cluster.members
                 if m.data.get('pause', False) != paused
                 and m.name in seen_index and seen_index[m.name] != m.index]
    if remaining:
        return logger.error("%s members didn't recognized pause state after %s seconds", remaining, ttl)
def while_not_sync_standby(self, func):
    """Execute ``func`` while asking not to be the synchronous standby.

    The node is tagged as unavailable for synchronous replication for the
    duration of the call (``_disable_sync`` counter).  After the member key
    is updated, the cluster state is polled for up to ``2 * loop_wait``
    seconds until this node is no longer the sync standby.  Failures to
    update the member key or to read the cluster only produce a warning;
    ``func`` is executed regardless.

    :param func: callable invoked without arguments.
    :returns: whatever ``func`` returns.
    """
    sync_may_pick_us = self.is_synchronous_mode() and not self.patroni.nosync
    if not sync_may_pick_us:
        return func()

    with self._member_state_lock:
        self._disable_sync += 1
    try:
        member_updated = self.touch_member()
        if member_updated:
            # The master picks up the new value on its next cycle; two cycles is a generous wait.
            for _tick in polling_loop(timeout=self.dcs.loop_wait * 2, interval=2):
                try:
                    released = not self.is_sync_standby(self.dcs.get_cluster())
                except DCSError:
                    logger.warning("Could not get cluster state, skipping synchronous standby disable")
                    break
                if released:
                    break
                logger.info("Waiting for master to release us from synchronous standby")
        else:
            logger.warning("Updating member state failed, skipping synchronous standby disable")

        return func()
    finally:
        with self._member_state_lock:
            self._disable_sync -= 1
def rsync_replica(config, desired_version, primary_ip, pid):
    """Re-sync this replica's data directory from the upgraded primary via rsync.

    :param config: configuration passed to ``PostgresqlUpgrade``.
    :param desired_version: version the cluster is expected to have afterwards.
    :param primary_ip: address of the primary running the rsync daemon.
    :param pid: pid of the postgres backend expected to be our (grand)parent.
    :returns: 1 when a safety check, the cluster stop or the rsync fails;
              0 when the cluster already has ``desired_version`` and in the
              parent process after the fork; otherwise (in the forked child)
              the result of ``postgresql.cleanup_old_pgdata()``.
    """
    from pg_upgrade import PostgresqlUpgrade
    from patroni.utils import polling_loop

    me = psutil.Process()

    # check that we are the child of postgres backend
    if me.parent().pid != pid and me.parent().parent().pid != pid:
        return 1

    backend = psutil.Process(pid)
    if 'postgres' not in backend.name():
        return 1

    postgresql = PostgresqlUpgrade(config)

    if postgresql.get_cluster_version() == desired_version:
        return 0

    # The parent returns right away; the forked child does the actual work.
    if os.fork():
        return 0

    # Wait until the remote side will close the connection and backend process exits
    for _ in polling_loop(10):
        if not backend.is_running():
            break
    else:
        logger.warning('Backend did not exit after 10 seconds')

    sysid = postgresql.sysid  # remember old sysid

    if not postgresql.stop(block_callbacks=True):
        logger.error('Failed to stop the cluster before rsync')
        return 1

    postgresql.switch_pgdata()

    update_configs(desired_version)

    # The rsync password is handed over through the environment of the child process.
    env = os.environ.copy()
    env['RSYNC_PASSWORD'] = postgresql.config.replication['password']
    # Copy only /data and /data_old, skipping the contents of their WAL directories.
    if subprocess.call([
            'rsync', '--archive', '--delete', '--hard-links', '--size-only', '--omit-dir-times',
            '--no-inc-recursive', '--include=/data/***', '--include=/data_old/***',
            '--exclude=/data/pg_xlog/*', '--exclude=/data_old/pg_xlog/*',
            '--exclude=/data/pg_wal/*', '--exclude=/data_old/pg_wal/*', '--exclude=*',
            'rsync://{0}@{1}:{2}/pgroot'.format(
                postgresql.name, primary_ip, RSYNC_PORT),
            os.path.dirname(postgresql.data_dir)
    ], env=env) != 0:
        logger.error('Failed to rsync from %s', primary_ip)
        postgresql.switch_back_pgdata()
        # XXX: rollback configs?
        return 1

    # Drop unset replication settings and rename 'username' to the 'user' connection keyword.
    conn_kwargs = {
        k: v
        for k, v in postgresql.config.replication.items() if v is not None
    }
    if 'username' in conn_kwargs:
        conn_kwargs['user'] = conn_kwargs.pop('username')

    # If restart Patroni right now there is a chance that it will exit due to the sysid mismatch.
    # Due to cleaned environment we can't always use DCS on replicas in this script, therefore
    # the good indicator of initialize key being deleted/updated is running primary after the upgrade.
    for _ in polling_loop(300):
        try:
            with postgresql.get_replication_connection_cursor(
                    primary_ip, **conn_kwargs) as cur:
                cur.execute('IDENTIFY_SYSTEM')
                # Stop waiting once the primary reports a system identifier different from the old one.
                if cur.fetchone()[0] != sysid:
                    break
        except Exception:
            # Connection failures are expected while the primary is down; keep polling.
            pass

    # If the cluster was unpaused earlier than we restarted Patroni, it might have created
    # the recovery.conf file and tried (and failed) to start the cluster up using wrong binaries.
    # In case of upgrade to 12+ presence of PGDATA/recovery.conf will not allow postgres to start.
    # We remove the recovery.conf and restart Patroni in order to make sure it is using correct config.
    try:
        postgresql.config.remove_recovery_conf()
    except Exception:
        pass
    kill_patroni()
    try:
        postgresql.config.remove_recovery_conf()
    except Exception:
        pass

    return postgresql.cleanup_old_pgdata()
def do_upgrade(self):
    """Run the in-place major version upgrade of the primary and its replicas.

    Steps, in order: pre-flight checks, initdb of the new data directory,
    ``pg_upgrade --check``, enabling maintenance mode, clean shutdown,
    waiting for replicas to reach the shutdown checkpoint, ``pg_upgrade``,
    switching PGDATA, rsync to replicas, restarting Patroni and postgres,
    updating extensions, analyze, resuming the cluster and cleanup.

    :returns: ``True`` when no upgrade is required; ``False`` or the result
              of ``logger.error`` when a step before the point of no return
              fails; otherwise ``ret``, which is ``False`` when a replica
              CHECKPOINT or the replica rsync failed and ``True`` otherwise.
    """
    from patroni.utils import polling_loop

    if not self.upgrade_required:
        logger.info(
            'Current version=%s, desired version=%s. Upgrade is not required',
            self.cluster_version, self.desired_version)
        return True

    if not (self.postgresql.is_running() and self.postgresql.is_leader()):
        return logger.error('PostgreSQL is not running or in recovery')

    cluster = self.dcs.get_cluster()

    if not self.sanity_checks(cluster):
        return False

    self._old_sysid = self.postgresql.sysid  # remember old sysid

    logger.info('Cluster %s is ready to be upgraded', self.postgresql.scope)
    if not self.postgresql.prepare_new_pgdata(self.desired_version):
        return logger.error('initdb failed')

    try:
        self.postgresql.drop_possibly_incompatible_extensions()
    except Exception:
        return logger.error(
            'Failed to drop possibly incompatible extensions')

    if not self.postgresql.pg_upgrade(check=True):
        return logger.error(
            'pg_upgrade --check failed, more details in the %s_upgrade',
            self.postgresql.data_dir)

    try:
        self.postgresql.drop_possibly_incompatible_objects()
    except Exception:
        return logger.error('Failed to drop possibly incompatible objects')

    # NOTE(review): this message goes through the root `logging` module while
    # everything else here uses the module `logger` - confirm whether intended.
    logging.info('Enabling maintenance mode')
    if not self.toggle_pause(True):
        return False

    logger.info('Doing a clean shutdown of the cluster before pg_upgrade')
    downtime_start = time.time()
    if not self.postgresql.stop(block_callbacks=True):
        return logger.error('Failed to stop the cluster before pg_upgrade')

    if self.replica_connections:
        from patroni.postgresql.misc import parse_lsn

        # Make sure we use the pg_controldata from the correct major version
        self.postgresql.set_bin_dir(self.cluster_version)
        controldata = self.postgresql.controldata()
        self.postgresql.set_bin_dir(self.desired_version)

        checkpoint_lsn = controldata.get('Latest checkpoint location')
        if controldata.get('Database cluster state'
                           ) != 'shut down' or not checkpoint_lsn:
            return logger.error("Cluster wasn't shut down cleanly")

        checkpoint_lsn = parse_lsn(checkpoint_lsn)
        logger.info('Latest checkpoint location: %s', checkpoint_lsn)

        logger.info('Starting rsyncd')
        self.start_rsyncd()

        if not self.wait_for_replicas(checkpoint_lsn):
            return False

        # rsyncd must have a pid and must not have exited yet.
        if not (self.rsyncd.pid and self.rsyncd.poll() is None):
            return logger.error('Failed to start rsyncd')

    if self.replica_connections:
        logger.info('Executing CHECKPOINT on replicas %s',
                    ','.join(self.replica_connections.keys()))
        pool = ThreadPool(len(self.replica_connections))
        # Do CHECKPOINT on replicas in parallel with pg_upgrade.
        # It will reduce the time for shutdown and so downtime.
        results = pool.map_async(self.checkpoint,
                                 self.replica_connections.items())
        pool.close()

    if not self.postgresql.pg_upgrade():
        return logger.error('Failed to upgrade cluster from %s to %s',
                            self.cluster_version, self.desired_version)

    self.postgresql.switch_pgdata()
    self.upgrade_complete = True

    logger.info('Updating configuration files')
    envdir = update_configs(self.desired_version)

    ret = True
    if self.replica_connections:
        # Check status of replicas CHECKPOINT and remove connections that are failed.
        pool.join()
        if results.ready():
            for name, status in results.get():
                if not status:
                    ret = False
                    self.replica_connections.pop(name)

    member = cluster.get_member(self.postgresql.name)
    if self.replica_connections:
        primary_ip = member.conn_kwargs().get('host')
        rsync_start = time.time()
        try:
            if not self.rsync_replicas(primary_ip):
                ret = False
        except Exception as e:
            logger.error('rsync failed: %r', e)
            ret = False
        logger.info('Rsync took %s seconds', time.time() - rsync_start)

        self.stop_rsyncd()
        time.sleep(2)  # Give replicas a bit of time to switch PGDATA

    # The initialize key is removed both before and after Patroni is killed.
    self.remove_initialize_key()
    kill_patroni()
    self.remove_initialize_key()

    time.sleep(1)
    for _ in polling_loop(10):
        if self.check_patroni_api(member):
            break
    else:
        logger.error(
            'Patroni REST API on primary is not accessible after 10 seconds'
        )

    logger.info('Starting the primary postgres up')
    for _ in polling_loop(10):
        try:
            result = self.request(member, 'post', 'restart', {})
            logger.info(' %s %s', result.status,
                        result.data.decode('utf-8'))
            if result.status < 300:
                break
        except Exception as e:
            logger.error('POST /restart failed: %r', e)
    else:
        logger.error('Failed to start primary after upgrade')

    logger.info('Upgrade downtime: %s', time.time() - downtime_start)

    # The last attempt to fix initialize key race condition
    cluster = self.dcs.get_cluster()
    if cluster.initialize == self._old_sysid:
        self.dcs.cancel_initialization()

    # Failing to update extensions is logged but does not abort the upgrade.
    try:
        self.postgresql.update_extensions()
    except Exception as e:
        logger.error('Failed to update extensions: %r', e)

    # start analyze early
    analyze_thread = Thread(target=self.analyze)
    analyze_thread.start()

    if self.replica_connections:
        self.wait_replicas_restart(cluster)

    self.resume_cluster()

    analyze_thread.join()

    self.reanalyze()

    logger.info('Total upgrade time (with analyze): %s',
                time.time() - downtime_start)
    self.postgresql.bootstrap.call_post_bootstrap(self.config['bootstrap'])
    self.postgresql.cleanup_old_pgdata()

    if envdir:
        self.start_backup(envdir)

    return ret
def rsync_replicas(self, primary_ip):
    """Trigger rsync on every connected replica and wait for their feedback.

    Replicas on which the trigger fails are removed from
    ``self.replica_connections``.  Feedback is read from per-replica files
    in ``self.rsyncd_feedback_dir`` for up to 300 seconds.

    :param primary_ip: address the replicas should rsync from.
    :returns: ``True`` only when every replica was triggered and reported a
              result starting with '0'; ``False`` otherwise.
    """
    from patroni.utils import polling_loop

    logger.info('Notifying replicas %s to start rsync',
                ','.join(self.replica_connections.keys()))
    ret = True
    status = {}
    for name, (ip, cur) in self.replica_connections.items():
        try:
            cur.execute("SELECT pg_catalog.pg_backend_pid()")
            pid = cur.fetchone()[0]
            # We use the COPY TO PROGRAM "hack" to start the rsync on replicas.
            # There are a few important points:
            # 1. The script is started as a child process of the postgres backend, which
            #    is running with a clean environment. I.e., the script will not see the
            #    values of PGVERSION, SPILO_CONFIGURATION, KUBERNETES_SERVICE_HOST.
            # 2. Since access to the DCS might not be possible, we pass the primary_ip.
            # 3. The desired_version is passed explicitly to guarantee a 100% match with the master.
            # 4. In order to protect from an accidental "rsync" we pass the pid of the postgres backend.
            #    The script will check that it is the child of that very specific postgres process.
            cur.execute(
                "COPY (SELECT) TO PROGRAM 'nohup {0} /scripts/inplace_upgrade.py {1} {2} {3}'"
                .format(sys.executable, self.desired_version, primary_ip,
                        pid))
            conn = cur.connection
            cur.close()
            conn.close()
        except Exception as e:
            logger.error('COPY TO PROGRAM on %s failed: %r', name, e)
            status[name] = False
            ret = False

    # At this point `status` only holds replicas whose trigger failed: forget them.
    for name in status.keys():
        self.replica_connections.pop(name)

    logger.info('Waiting for replicas rsync to complete')
    # Reuse `status` for the feedback read from the remaining replicas.
    status.clear()
    for _ in polling_loop(300):
        synced = True
        for name in self.replica_connections.keys():
            feedback = os.path.join(self.rsyncd_feedback_dir, name)
            if name not in status and os.path.exists(feedback):
                with open(feedback) as f:
                    status[name] = f.read().strip()

            if name not in status:
                synced = False
        if synced:
            break

    for name in self.replica_connections.keys():
        result = status.get(name)
        if result is None:
            logger.error(
                'Did not received rsync feedback from %s after 300 seconds',
                name)
            ret = False
        # Feedback that does not start with '0' is reported as a failed rsync.
        elif not result.startswith('0'):
            logger.error('Rsync on %s finished with code %s', name, result)
            ret = False
    return ret
def main():
    """Upgrade the local cluster to the version of the installed binaries.

    Callbacks are disabled and ``pg_ctl_timeout`` is raised to one week
    before the upgrade object is created.  Returns 0 immediately when the
    cluster version already equals the binary version; any failed step
    raises ``Exception``.
    """
    from patroni.config import Config
    from patroni.utils import polling_loop
    from pg_upgrade import PostgresqlUpgrade

    config = Config()
    overrides = {'callbacks': {}, 'pg_ctl_timeout': 3600 * 24 * 7}
    config['postgresql'].update(overrides)
    pg = PostgresqlUpgrade(config['postgresql'])

    def stop_old_cluster():
        # The very same stop call is used on bail-out paths and for the final shutdown.
        return pg.stop(block_callbacks=True, checkpoint=False)

    def scalar(sql):
        # First column of the first row returned for ``sql``.
        return pg.query(sql).fetchone()[0]

    target_version = pg.get_binary_version()
    current_version = pg.get_cluster_version()
    if current_version == target_version:
        return 0

    logger.info('Cluster version: %s, bin version: %s', current_version, target_version)
    assert float(current_version) < float(target_version)

    logger.info('Trying to start the cluster with old postgres')
    started = pg.start_old_cluster(config['bootstrap'], current_version)
    if not started:
        raise Exception('Failed to start the cluster with old postgres')

    for _attempt in polling_loop(pg.config.get('pg_ctl_timeout'), 10):
        pg.reset_cluster_info_state()
        if not pg.is_leader():
            logger.info('waiting for end of recovery of the old cluster')
            continue
        break

    if not pg.bootstrap.call_post_bootstrap(config['bootstrap']):
        stop_old_cluster()
        raise Exception('Failed to run bootstrap.post_init')

    initdb_config = [{'locale': scalar('SHOW lc_collate')},
                     {'encoding': scalar('SHOW server_encoding')}]
    if scalar("SELECT current_setting('data_checksums')::bool"):
        initdb_config.append('data-checksums')

    logger.info('Dropping objects from the cluster which could be incompatible')
    try:
        pg.drop_possibly_incompatible_objects()
    except Exception:
        stop_old_cluster()
        raise

    logger.info('Doing a clean shutdown of the cluster before pg_upgrade')
    if not stop_old_cluster():
        raise Exception('Failed to stop the cluster with old postgres')

    logger.info('initdb config: %s', initdb_config)

    logger.info('Executing pg_upgrade')
    if not pg.do_upgrade(target_version, initdb_config):
        failure = 'Failed to upgrade cluster from {0} to {1}'.format(current_version, target_version)
        raise Exception(failure)

    logger.info('Starting the cluster with new postgres after upgrade')
    if not pg.start():
        raise Exception('Failed to start the cluster with new postgres')
    pg.analyze()
def do_upgrade(self):
    """Run the in-place major version upgrade of the primary and its replicas.

    Steps, in order: pre-flight checks, initdb of the new data directory,
    ``pg_upgrade --check``, enabling maintenance mode, clean shutdown,
    waiting for replicas to reach the latest checkpoint, ``pg_upgrade``,
    switching PGDATA, rsync to replicas, restarting Patroni and postgres,
    updating extensions, analyze, resuming the cluster and cleanup.

    :returns: ``True`` when no upgrade is required; ``False`` or the result
              of ``logger.error`` when a step before the point of no return
              fails; otherwise the replica rsync outcome (``True`` when there
              were no replicas to sync).
    """
    from patroni.utils import polling_loop

    if not self.upgrade_required:
        logger.info(
            'Current version=%s, desired version=%s. Upgrade is not required',
            self.cluster_version, self.desired_version)
        return True

    if not (self.postgresql.is_running() and self.postgresql.is_leader()):
        return logger.error('PostgreSQL is not running or in recovery')

    cluster = self.dcs.get_cluster()

    if not self.sanity_checks(cluster):
        return False

    self._old_sysid = self.postgresql.sysid  # remember old sysid

    logger.info('Cluster %s is ready to be upgraded', self.postgresql.scope)
    if not self.postgresql.prepare_new_pgdata(self.desired_version):
        return logger.error('initdb failed')

    if not self.postgresql.pg_upgrade(check=True):
        return logger.error(
            'pg_upgrade --check failed, more details in the %s_upgrade',
            self.postgresql.data_dir)

    try:
        self.postgresql.drop_possibly_incompatible_objects()
    except Exception:
        return logger.error('Failed to drop possibly incompatible objects')

    # Use the module logger like the rest of this block, not the root-level helper.
    logger.info('Enabling maintenance mode')
    if not self.toggle_pause(True):
        return False

    logger.info('Doing a clean shutdown of the cluster before pg_upgrade')
    downtime_start = time.time()
    if not self.postgresql.stop(block_callbacks=True):
        return logger.error('Failed to stop the cluster before pg_upgrade')

    if self.replica_connections:
        checkpoint_lsn = int(self.postgresql.latest_checkpoint_location())
        logger.info('Latest checkpoint location: %s', checkpoint_lsn)

        logger.info('Starting rsyncd')
        self.start_rsyncd()

        if not self.wait_for_replicas(checkpoint_lsn):
            return False

        # rsyncd must have a pid and must not have exited yet.
        if not (self.rsyncd.pid and self.rsyncd.poll() is None):
            return logger.error('Failed to start rsyncd')

    if not self.postgresql.pg_upgrade():
        return logger.error('Failed to upgrade cluster from %s to %s',
                            self.cluster_version, self.desired_version)

    self.postgresql.switch_pgdata()
    self.upgrade_complete = True

    logger.info('Updating configuration files')
    envdir = update_configs(self.desired_version)

    # Bind the result up front: without replicas the rsync branch below is
    # skipped and the final `return ret` would raise UnboundLocalError.
    ret = True

    member = cluster.get_member(self.postgresql.name)
    if self.replica_connections:
        primary_ip = member.conn_kwargs().get('host')
        rsync_start = time.time()
        try:
            ret = self.rsync_replicas(primary_ip)
        except Exception as e:
            logger.error('rsync failed: %r', e)
            ret = False
        logger.info('Rsync took %s seconds', time.time() - rsync_start)

        self.stop_rsyncd()
        time.sleep(2)  # Give replicas a bit of time to switch PGDATA

    # The initialize key is removed both before and after Patroni is killed.
    self.remove_initialize_key()
    kill_patroni()
    self.remove_initialize_key()

    time.sleep(1)
    for _ in polling_loop(10):
        if self.check_patroni_api(member):
            break
    else:
        logger.error(
            'Patroni REST API on primary is not accessible after 10 seconds'
        )

    logger.info('Starting the primary postgres up')
    for _ in polling_loop(10):
        try:
            result = self.request(member, 'post', 'restart', {})
            logger.info(' %s %s', result.status,
                        result.data.decode('utf-8'))
            if result.status < 300:
                break
        except Exception as e:
            logger.error('POST /restart failed: %r', e)
    else:
        logger.error('Failed to start primary after upgrade')

    logger.info('Upgrade downtime: %s', time.time() - downtime_start)

    # Failing to update extensions is logged but does not abort the upgrade.
    try:
        self.postgresql.update_extensions()
    except Exception as e:
        logger.error('Failed to update extensions: %r', e)

    # start analyze early
    analyze_thread = Thread(target=self.analyze)
    analyze_thread.start()

    self.wait_replicas_restart(cluster)

    self.resume_cluster()

    analyze_thread.join()

    self.reanalyze()

    logger.info('Total upgrade time (with analyze): %s',
                time.time() - downtime_start)
    self.postgresql.bootstrap.call_post_bootstrap(self.config['bootstrap'])
    self.postgresql.cleanup_old_pgdata()

    if envdir:
        self.start_backup(envdir)

    return ret
def _wait_promote(self, wait_seconds):
    """Poll the control data until the cluster state is 'in production'.

    :param wait_seconds: how long to keep polling.
    :returns: ``True`` once the state is 'in production'; falls through
              (``None``) when ``wait_seconds`` elapse first.
    """
    for _tick in polling_loop(wait_seconds):
        cluster_state = self.controldata().get('Database cluster state')
        if cluster_state == 'in production':
            return True
def test_polling_loop(self):
    """A 1 ms timeout with a 1 ms interval yields exactly one iteration, numbered 0."""
    # assertEquals is a deprecated alias that was removed in Python 3.12.
    self.assertEqual(list(polling_loop(0.001, interval=0.001)), [0])