def install_administrative_scripts():
    """Install the backup, reaper and log helper scripts for this unit.

    Copies the administrative scripts shipped with the charm into the
    unit's scripts directory, renders the per-database backup wrapper,
    and ensures the logs/backups directories and the backups log file
    exist with the expected ownership and permissions.
    """
    scripts_dir = helpers.scripts_dir()
    logs_dir = helpers.logs_dir()
    helpers.makedirs(scripts_dir, mode=0o755)

    # The database backup script. Most of this is redundant now.
    source = os.path.join(hookenv.charm_dir(), 'scripts', 'pgbackup.py')
    destination = os.path.join(scripts_dir, 'dump-pg-db')
    with open(source, 'r') as f:
        helpers.write(destination, f.read(), mode=0o755)

    backups_dir = helpers.backups_dir()
    helpers.makedirs(backups_dir, mode=0o750, user='******', group='postgres')

    # Generate a wrapper that invokes the backup script for each
    # database.
    data = dict(logs_dir=logs_dir,
                scripts_dir=scripts_dir,
                # backups_dir probably should be deprecated in favour of
                # a juju storage mount.
                backups_dir=backups_dir)
    destination = os.path.join(helpers.scripts_dir(), 'pg_backup_job')
    templating.render('pg_backup_job.tmpl', destination, data,
                      owner='root', group='postgres', perms=0o755)

    # Install the reaper scripts.
    script = 'pgkillidle.py'
    source = os.path.join(hookenv.charm_dir(), 'scripts', script)
    destination = os.path.join(scripts_dir, script)
    # BUG FIX: this previously tested os.path.exists(source), so a
    # missing *destination* was never reinstalled (the source always
    # exists inside the charm). Test the copy target instead, matching
    # the newer variant of this function elsewhere in the file.
    if (reactive.helpers.any_file_changed([source]) or
            not os.path.exists(destination)):
        with open(source, 'r') as f:
            helpers.write(destination, f.read(), mode=0o755)

    if not os.path.exists(logs_dir):
        helpers.makedirs(logs_dir, mode=0o755,
                         user='******', group='postgres')

    # Create the backups.log file used by the backup wrapper if it
    # does not exist, in order to avoid spurious alerts when a
    # unit is installed, per Bug #1329816.
    helpers.write(helpers.backups_log_path(), '', mode=0o644,
                  user='******', group='postgres')
def update_pgpass():
    """Write a Juju-managed ~/.pgpass for each administrative account."""
    leader = context.Leader()
    for account in ("root", "postgres", "ubuntu"):
        pgpass_path = os.path.expanduser(
            os.path.join("~{}".format(account), ".pgpass"))
        pgpass_content = "# Managed by Juju\n" "*:*:*:{}:{}".format(
            replication.replication_username(),
            leader.get("replication_password"),
        )
        helpers.write(pgpass_path, pgpass_content, mode=0o600,
                      user=account, group=account)
def update_pgpass():
    """Refresh .pgpass files so replication credentials stay current."""
    # NOTE(review): a second definition of update_pgpass exists in this
    # file; in Python the later definition wins. Consider removing one.
    leader = context.Leader()
    credentials = '*:*:*:{}:{}'.format(replication.replication_username(),
                                       leader.get('replication_password'))
    content = '# Managed by Juju\n' + credentials
    accounts = ['root', 'postgres', 'ubuntu']
    for account in accounts:
        home = '~{}'.format(account)
        path = os.path.expanduser(os.path.join(home, '.pgpass'))
        helpers.write(path, content, mode=0o600,
                      user=account, group=account)
def create_pg_ctl_conf():
    """Write the cluster's pg_ctl configuration and raise the done flag."""
    conf_text = textwrap.dedent(
        """\
        # Managed by Juju
        # Automatic pg_ctl configuration
        # This configuration file contains cluster specific options to be passed to
        # pg_ctl(1).
        pg_ctl_options = '-w -t 3600'
        """
    )
    conf_path = postgresql.pg_ctl_conf_path()
    helpers.write(conf_path, conf_text, mode=0o644,
                  user="******", group="postgres")
    reactive.set_flag("postgresql.cluster.pg_ctl_conf.created")
def write_metrics_cronjob():
    """Install, update or remove the statsd metrics cron job.

    When metrics_target is unset the cron job is removed; otherwise the
    collection script is installed and a crontab rendered into cron.d.
    """
    config = hookenv.config()
    path = os.path.join(helpers.cron_dir(), 'juju-postgresql-metrics')

    # Validated in preflight.block_on_invalid_config()
    metrics_target = config['metrics_target'].strip()
    metrics_sample_interval = config['metrics_sample_interval']

    reactive.remove_state('postgresql.metrics.needs_update')

    if not metrics_target:
        if os.path.exists(path):
            hookenv.log('Turning off metrics cronjob')
            os.unlink(path)
        return

    charm_dir = hookenv.charm_dir()
    statsd_host, statsd_port = metrics_target.split(':', 1)
    metrics_prefix = config['metrics_prefix'].strip()
    metrics_prefix = metrics_prefix.replace(
        "$UNIT", hookenv.local_unit().replace('.', '-').replace('/', '-'))

    # ensure script installed
    charm_script = os.path.join(charm_dir, 'files', 'metrics',
                                'postgres_to_statsd.py')
    script_path = os.path.join(helpers.scripts_dir(), 'postgres_to_statsd.py')
    with open(charm_script, 'r') as f:
        helpers.write(script_path, f.read(), mode=0o755)

    # write the crontab
    data = dict(interval=config['metrics_sample_interval'],
                script_path=script_path,
                metrics_prefix=metrics_prefix,
                metrics_sample_interval=metrics_sample_interval,
                statsd_host=statsd_host,
                statsd_port=statsd_port)
    # BUG FIX: the crontab was previously rendered over charm_script
    # (the source file inside the charm directory), clobbering it and
    # never installing the cron job. Render to the cron.d path computed
    # above — the same path the disable branch removes.
    templating.render('metrics_cronjob.template', path, data, perms=0o644)
def write_metrics_cronjob():
    """Install, update or remove the statsd metrics cron job.

    When metrics_target is unset the cron job is removed; otherwise the
    collection script is installed and a crontab rendered into cron.d.
    """
    config = hookenv.config()
    path = os.path.join(helpers.cron_dir(), "juju-postgresql-metrics")

    # Validated in preflight.block_on_invalid_config()
    metrics_target = config["metrics_target"].strip()
    metrics_sample_interval = config["metrics_sample_interval"]

    reactive.remove_state("postgresql.metrics.needs_update")

    if not metrics_target:
        if os.path.exists(path):
            hookenv.log("Turning off metrics cronjob")
            os.unlink(path)
        return

    charm_dir = hookenv.charm_dir()
    statsd_host, statsd_port = metrics_target.split(":", 1)
    metrics_prefix = config["metrics_prefix"].strip()
    metrics_prefix = metrics_prefix.replace("$UNIT", hookenv.local_unit().replace(".", "-").replace("/", "-"))

    # ensure script installed
    charm_script = os.path.join(charm_dir, "files", "metrics", "postgres_to_statsd.py")
    script_path = os.path.join(helpers.scripts_dir(), "postgres_to_statsd.py")
    with open(charm_script, "r") as f:
        helpers.write(script_path, f.read(), mode=0o755)

    # write the crontab
    data = dict(
        interval=config["metrics_sample_interval"],
        script_path=script_path,
        metrics_prefix=metrics_prefix,
        metrics_sample_interval=metrics_sample_interval,
        statsd_host=statsd_host,
        statsd_port=statsd_port,
    )
    # BUG FIX: the crontab was previously rendered over charm_script
    # (the source file inside the charm directory), clobbering it and
    # never installing the cron job. Render to the cron.d path computed
    # above — the same path the disable branch removes.
    templating.render("metrics_cronjob.template", path, data, perms=0o644)
def install_administrative_scripts():
    """Install backup, reaper and log helper scripts for this unit.

    Copies the administrative scripts shipped with the charm into the
    unit's scripts directory, renders the per-database backup wrapper,
    and ensures the logs/backups directories and the backups log file
    exist with the expected ownership and permissions. Sets the
    postgresql.cluster.support-scripts reactive state when done.
    """
    scripts_dir = helpers.scripts_dir()
    logs_dir = helpers.logs_dir()
    helpers.makedirs(scripts_dir, mode=0o755)

    # The database backup script. Most of this is redundant now.
    source = os.path.join(hookenv.charm_dir(), "scripts", "pgbackup.py")
    destination = os.path.join(scripts_dir, "dump-pg-db")
    with open(source, "r") as f:
        helpers.write(destination, f.read(), mode=0o755)

    backups_dir = helpers.backups_dir()
    helpers.makedirs(backups_dir, mode=0o750, user="******", group="postgres")

    # Generate a wrapper that invokes the backup script for each
    # database.
    data = dict(
        logs_dir=logs_dir,
        scripts_dir=scripts_dir,
        # backups_dir probably should be deprecated in favour of
        # a juju storage mount.
        backups_dir=backups_dir,
    )
    destination = os.path.join(helpers.scripts_dir(), "pg_backup_job")
    templating.render(
        "pg_backup_job.tmpl",
        destination,
        data,
        owner="root",
        group="postgres",
        perms=0o755,
    )

    # Install the reaper scripts.
    script = "pgkillidle.py"
    source = os.path.join(hookenv.charm_dir(), "scripts", script)
    destination = os.path.join(scripts_dir, script)
    # Reinstall when the charm's copy changed or our copy went missing.
    if reactive.helpers.any_file_changed([source]) or not os.path.exists(destination):
        with open(source, "r") as f:
            helpers.write(destination, f.read(), mode=0o755)

    if not os.path.exists(logs_dir):
        helpers.makedirs(logs_dir, mode=0o755, user="******", group="postgres")

    # Create the backups.log file used by the backup wrapper if it
    # does not exist, in order to avoid spurious alerts when a
    # unit is installed, per Bug #1329816.
    helpers.write(
        helpers.backups_log_path(),
        "",
        mode=0o644,
        user="******",
        group="postgres",
    )
    reactive.set_state("postgresql.cluster.support-scripts")
def update_wal_e_env_dir():
    '''Regenerate the envdir(1) environment used to drive WAL-E.

    We do this even if wal-e is not enabled to ensure we destroy
    any secrets potentially left around from when it was enabled.
    '''
    config = hookenv.config()
    env = {
        # wal-e Swift creds
        'SWIFT_AUTHURL': config.get('os_auth_url', ''),
        'SWIFT_TENANT': config.get('os_tenant_name', ''),
        'SWIFT_USER': config.get('os_username', ''),
        'SWIFT_PASSWORD': config.get('os_password', ''),
        # wal-e AWS creds
        'AWS_ACCESS_KEY_ID': config.get('aws_access_key_id', ''),
        'AWS_SECRET_ACCESS_KEY': config.get('aws_secret_access_key', ''),
        # wal-e Azure cred
        'WABS_ACCOUNT_NAME': config.get('wabs_account_name', ''),
        'WABS_ACCESS_KEY': config.get('wabs_access_key', ''),
        # OpenStack creds for swift(1) cli tool
        'OS_AUTH_URL': config.get('os_auth_url', ''),
        'OS_USERNAME': config.get('os_username', ''),
        'OS_PASSWORD': config.get('os_password', ''),
        'OS_TENANT_NAME': config.get('os_tenant_name', ''),
        'WALE_SWIFT_PREFIX': '',
        'WALE_S3_PREFIX': '',
        'WALE_WABS_PREFIX': '',
    }

    uri = config.get('wal_e_storage_uri', None)
    if uri:
        # Map each supported URI scheme to the env key that carries the
        # storage prefix and the credentials it requires.
        schemes = {
            'swift': ('WALE_SWIFT_PREFIX',
                      ['SWIFT_AUTHURL', 'SWIFT_TENANT',
                       'SWIFT_USER', 'SWIFT_PASSWORD']),
            's3': ('WALE_S3_PREFIX',
                   ['AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY']),
            'wabs': ('WALE_WABS_PREFIX',
                     ['WABS_ACCOUNT_NAME', 'WABS_ACCESS_KEY']),
        }
        scheme = urlparse(uri).scheme
        if scheme in schemes:
            prefix_key, required_env = schemes[scheme]
            env[prefix_key] = uri
        else:
            required_env = []
            hookenv.log('Invalid wal_e_storage_uri {}'.format(uri), ERROR)
        for env_key in required_env:
            if not env[env_key].strip():
                hookenv.log('Missing {}'.format(env_key), ERROR)

    # Regenerate the envdir(1) environment recommended by WAL-E.
    # All possible keys are rewritten to ensure we remove old secrets.
    helpers.makedirs(wal_e_env_dir(), mode=0o750,
                     user='******', group='postgres')
    for key, value in env.items():
        helpers.write(os.path.join(wal_e_env_dir(), key), value.strip(),
                      mode=0o640, user='******', group='postgres')

    reactive.set_state('postgresql.wal_e.configured')
def update_wal_e_env_dir(dirpath, storage_uri):
    """Regenerate the envdir(1) environment used to drive WAL-E.

    We do this even if wal-e is not enabled to ensure we destroy
    any secrets potentially left around from when it was enabled.
    """
    config = hookenv.config()
    env = {
        # wal-e Swift creds
        "SWIFT_AUTHURL": config.get("os_auth_url", ""),
        "SWIFT_USER": config.get("os_username", ""),
        "SWIFT_PASSWORD": config.get("os_password", ""),
        "SWIFT_TENANT": config.get("os_tenant_name", ""),
        "SWIFT_REGION": config.get("os_region_name", ""),
        "SWIFT_AUTH_VERSION": config.get("os_identity_api_version", ""),
        "SWIFT_USER_DOMAIN_NAME": config.get("os_user_domain_name", ""),
        "SWIFT_PROJECT_NAME": config.get("os_project_name", ""),
        "SWIFT_PROJECT_DOMAIN_NAME": config.get("os_project_domain_name", ""),
        # wal-e AWS creds
        "AWS_ACCESS_KEY_ID": config.get("aws_access_key_id", ""),
        "AWS_SECRET_ACCESS_KEY": config.get("aws_secret_access_key", ""),
        "AWS_REGION": config.get("aws_region", ""),
        # wal-e Azure cred
        "WABS_ACCOUNT_NAME": config.get("wabs_account_name", ""),
        "WABS_ACCESS_KEY": config.get("wabs_access_key", ""),
        # OpenStack creds for swift(1) cli tool
        "OS_AUTH_URL": config.get("os_auth_url", ""),
        "OS_USERNAME": config.get("os_username", ""),
        "OS_PASSWORD": config.get("os_password", ""),
        "OS_TENANT_NAME": config.get("os_tenant_name", ""),
        "OS_REGION_NAME": config.get("os_region_name", ""),
        "OS_IDENTITY_API_VERSION": config.get("os_identity_api_version", ""),
        "OS_USER_DOMAIN_NAME": config.get("os_user_domain_name", ""),
        "OS_PROJECT_NAME": config.get("os_project_name", ""),
        "OS_PROJECT_DOMAIN_NAME": config.get("os_project_domain_name", ""),
        "WALE_SWIFT_PREFIX": "",
        "WALE_S3_PREFIX": "",
        "WALE_WABS_PREFIX": "",
    }

    if storage_uri:
        # Map each supported URI scheme to the env key that carries the
        # storage prefix and the credentials it requires.
        schemes = {
            "swift": ("WALE_SWIFT_PREFIX",
                      ["SWIFT_AUTHURL", "SWIFT_USER", "SWIFT_PASSWORD"]),
            "s3": ("WALE_S3_PREFIX",
                   ["AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY",
                    "AWS_REGION"]),
            "wabs": ("WALE_WABS_PREFIX",
                     ["WABS_ACCOUNT_NAME", "WABS_ACCESS_KEY"]),
        }
        scheme = urlparse(storage_uri).scheme
        if scheme in schemes:
            prefix_key, required_env = schemes[scheme]
            env[prefix_key] = storage_uri
        else:
            required_env = []
            hookenv.log("Invalid wal_e_storage_uri {}".format(storage_uri), ERROR)
        for env_key in required_env:
            if not env[env_key].strip():
                hookenv.log("Missing {}".format(env_key), ERROR)

    # Regenerate the envdir(1) environment recommended by WAL-E.
    # All possible keys are rewritten to ensure we remove old secrets.
    helpers.makedirs(dirpath, mode=0o750, user="******", group="postgres")
    for key, value in env.items():
        helpers.write(
            os.path.join(dirpath, key),
            value.strip(),
            mode=0o640,
            user="******",
            group="postgres",
        )
def wal_e_restore():
    """Juju action: restore this unit's database from a WAL-E backup.

    Destructive PITR restore. Requires confirm=true; refuses to restore
    from the same URI the unit archives to. Stops PostgreSQL, wipes the
    data directory, fetches the named backup, writes a recovery.conf,
    then waits for recovery (promoting on the master, shutting down and
    re-following the master on standbys) before resetting reactive
    states so handlers re-publish cluster state.
    """
    reactive.remove_state("action.wal-e-restore")
    params = hookenv.action_get()
    backup = params["backup-name"].strip().replace("-", "_")
    storage_uri = params["storage-uri"].strip()
    ship_uri = hookenv.config().get("wal_e_storage_uri")
    if storage_uri == ship_uri:
        # Restoring from the archive target would corrupt the archive.
        hookenv.action_fail(
            "The storage-uri parameter is identical to "
            "the wal_e_storage_uri config setting. Your "
            "restoration source cannot be the same as the "
            "folder you are archiving too to avoid corrupting "
            "the backups."
        )
        return
    if not params["confirm"]:
        # Dry run: describe what would be destroyed and bail out.
        m = "Recovery from {}.".format(storage_uri)
        if ship_uri:
            m += "\nContents of {} will be destroyed.".format(ship_uri)
        m += "\nExisting local database will be destroyed."
        m += "\nRerun action with 'confirm=true' to proceed."
        hookenv.action_set({"info": m})
        return
    with tempfile.TemporaryDirectory(prefix="wal-e", suffix="envdir") as envdir:
        # Temporary envdir pointing WAL-E at the restore source.
        update_wal_e_env_dir(envdir, storage_uri)

        # Confirm there is a backup to restore
        backups = wal_e_list_backups(envdir)
        if not backups:
            hookenv.action_fail("No backups found at {}".format(storage_uri))
            return
        if backup != "LATEST" and backup not in (b["name"] for b in backups):
            hookenv.action_fail("Backup {} not found".format(backup))
            return

        # Shutdown PostgreSQL. Note we want this action to run synchronously,
        # so there is no opportunity to ask permission from the leader. If
        # there are other units cloning this database, those clone operations
        # will fail. Which seems preferable to blocking a recovery operation
        # in any case, because if we are doing disaster recovery we generally
        # want to do it right now.
        status_set("maintenance", "Stopping PostgreSQL for backup restoration")
        postgresql.stop()

        # Trash the existing database. Its dangerous to do this first, but
        # we probably need the space.
        data_dir = postgresql.data_dir()  # May be a symlink
        for content in os.listdir(data_dir):
            cpath = os.path.join(data_dir, content)
            if os.path.isdir(cpath) and not os.path.islink(cpath):
                shutil.rmtree(cpath)
            else:
                os.remove(cpath)

        # WAL-E recover
        status_set("maintenance", "Restoring backup {}".format(backup))
        wal_e_run(["backup-fetch", data_dir, backup], envdir=envdir)

        # Create recovery.conf to complete recovery
        is_master = reactive.is_state("postgresql.replication.is_master")
        standby_mode = "off" if is_master else "on"
        if params.get("target-time"):
            target_time = "recovery_target_time='{}'" "".format(params["target-time"])
        else:
            target_time = ""
        target_action = "promote" if is_master else "shutdown"
        immediate = "" if is_master else "recovery_target='immediate'"
        helpers.write(
            postgresql.recovery_conf_path(),
            dedent(
                """\
                # Managed by Juju. PITR in progress.
                standby_mode = {}
                restore_command='{}'
                recovery_target_timeline = {}
                recovery_target_action = {}
                {}
                {}
                """
            ).format(
                standby_mode,
                wal_e_restore_command(envdir=envdir),
                params["target-timeline"],
                target_action,
                target_time,
                immediate,
            ),
            mode=0o600,
            user="******",
            group="postgres",
        )

        # Avoid circular import. We could also avoid the import entirely
        # with a sufficiently complex set of handlers in the replication
        # module, but that seems to be a worse solution. Better to break
        # out this action into a separate module.
        from reactive.postgresql import replication

        if is_master:
            if ship_uri:
                # If master, trash the configured wal-e storage. This may
                # contain WAL and backups from the old cluster which will
                # conflict with the new cluster. Hopefully it does not
                # contain anything important, because we have no way to
                # prompt the user for confirmation.
                wal_e_run(["delete", "--confirm", "everything"])

            # Then, wait for recovery and promotion.
            postgresql.start()
            con = postgresql.connect()
            cur = con.cursor()
            while True:
                # Replay-location function was renamed in PostgreSQL 10.
                if postgresql.has_version("10"):
                    cur.execute(
                        """SELECT pg_is_in_recovery(), pg_last_wal_replay_lsn()"""
                    )
                else:
                    cur.execute(
                        """SELECT pg_is_in_recovery(), pg_last_xlog_replay_location()"""
                    )
                in_rec, loc = cur.fetchone()
                if not in_rec:
                    break
                status_set("maintenance", "Recovery at {}".format(loc))
                time.sleep(10)
        else:
            # If standby, startup and wait for recovery to complete and
            # shutdown.
            status_set("maintenance", "Recovery")
            # Startup might shutdown immediately and look like a failure.
            postgresql.start(ignore_failure=True)
            # No recovery point status yet for standbys, as we would need
            # to handle connection failures when the DB shuts down. We
            # should do this.
            while postgresql.is_running():
                time.sleep(5)
            replication.update_recovery_conf(follow=replication.get_master())

    # Reactive handlers will deal with the rest of the cleanup.
    # eg. ensuring required users and roles exist
    replication.update_replication_states()
    reactive.remove_state("postgresql.cluster.configured")
    reactive.toggle_state("postgresql.cluster.is_running", postgresql.is_running())
    reactive.remove_state("postgresql.nagios.user_ensured")
    reactive.remove_state("postgresql.replication.replication_user_created")
    reactive.remove_state("postgresql.client.published")
def update_nagios_pgpass():
    """Write the nagios account's pgpass file from the leader's secret."""
    leader = context.Leader()
    nagios_password = leader["nagios_password"]
    pgpass_line = "*:*:*:{}:{}".format(nagios_username(), nagios_password)
    helpers.write(nagios_pgpass_path(), pgpass_line,
                  mode=0o600, user="******", group="nagios")
def update_nrpe_config():
    """Regenerate the NRPE check definitions for this unit.

    Installs the pgsql connectivity check, the stale-WAL check (script,
    cron job and nagios plugin), and the backups check (real on the
    master, a dummy on standbys so alerts still flow after failover).
    Clears the postgresql.nagios.needs_update reactive state when done.
    """
    update_nagios_pgpass()
    nrpe = NRPE()
    user = nagios_username()
    port = postgresql.port()
    nrpe.add_check(
        shortname="pgsql",
        description="Check pgsql",
        check_cmd="check_pgsql -P {} -l {}".format(port, user),
    )

    # copy the check script which will run cronned as postgres user
    with open("scripts/find_latest_ready_wal.py") as fh:
        check_script = fh.read()
    check_script_path = "{}/{}".format(helpers.scripts_dir(), "find_latest_ready_wal.py")
    helpers.write(check_script_path, check_script, mode=0o755)

    # create an (empty) file with appropriate permissions for the above
    check_output_path = "/var/lib/nagios/postgres-wal-max-age.txt"
    if not os.path.exists(check_output_path):
        helpers.write(check_output_path, b"0\n", mode=0o644, user="******", group="postgres")

    # retrieve the threshold values from the charm config
    config = hookenv.config()
    check_warn_threshold = config["wal_archive_warn_threshold"] or 0
    check_crit_threshold = config["wal_archive_crit_threshold"] or 0

    check_cron_path = "/etc/cron.d/postgres-wal-archive-check"
    if check_warn_threshold and check_crit_threshold:
        # create the cron job to run the above
        check_cron = "*/2 * * * * postgres {}".format(check_script_path)
        helpers.write(check_cron_path, check_cron, mode=0o644)
    elif os.path.exists(check_cron_path):
        # BUG FIX: previously a stale cron job was left behind when the
        # thresholds were later unset, so the check kept running forever.
        os.unlink(check_cron_path)

    # copy the nagios plugin which will check the cronned output
    with open("scripts/check_latest_ready_wal.py") as fh:
        check_script = fh.read()
    check_script_path = "{}/{}".format("/usr/local/lib/nagios/plugins", "check_latest_ready_wal.py")
    helpers.write(check_script_path, check_script, mode=0o755)

    # write the nagios check definition
    nrpe.add_check(
        shortname="pgsql_stale_wal",
        description="Check for stale WAL backups",
        check_cmd="{} {} {}".format(check_script_path, check_warn_threshold, check_crit_threshold),
    )

    if reactive.is_state("postgresql.replication.is_master"):
        # TODO: These should be calculated from the backup schedule,
        # which is difficult since that is specified in crontab format.
        warn_age = 172800
        crit_age = 194400
        backups_log = helpers.backups_log_path()
        nrpe.add_check(
            shortname="pgsql_backups",
            description="Check pgsql backups",
            check_cmd=("check_file_age -w {} -c {} -f {}"
                       "".format(warn_age, crit_age, backups_log)),
        )
    else:
        # Standbys don't do backups. We still generate a check though,
        # to ensure alerts get through to monitoring after a failover.
        nrpe.add_check(
            shortname="pgsql_backups",
            description="Check pgsql backups",
            check_cmd=r"check_dummy 0 standby_does_not_backup",
        )

    nrpe.write()
    reactive.remove_state("postgresql.nagios.needs_update")