def run(self): """Reboot all Kafka nodes on a given cluster""" ask_confirmation( 'Please check the Grafana dashboard of the cluster and make sure that ' 'topic partition leaders are well balanced and that all brokers are up and running.' ) logger.info( 'Checking that all Kafka brokers are reported up by their systemd unit status.' ) self.kafka_brokers.run_sync('systemctl status kafka') logger.info('Checking if /etc/profile.d/kafka.sh can be sourced.') self.kafka_brokers.run_sync('source /etc/profile.d/kafka.sh') if self.sleep_before_pref_replica_election < 900: ask_confirmation( 'The sleep time between a node restart and kafka preferred-replica-election ' 'is less than 900 seconds. The broker needs some time to recover after a restart. ' 'Are you sure?') for host in self.kafka_brokers.hosts: logger.info('Starting reboot of kafka host %s', host) self.reboot_kafka_node(host) logger.info('Sleeping %s before next host', self.batch_sleep_seconds) sleep(self.batch_sleep_seconds) logger.info('All Kafka node reboots completed!')
def run(self): """Required by Spicerack API.""" ask_confirmation( 'If a config change is being rolled-out, please run puppet on all hosts ' 'before proceeding.') with self.icinga_hosts.downtimed(self.admin_reason, duration=timedelta(minutes=60)): logger.info("Depool and test on canary: %s", self.aqs_canary.hosts) self.aqs_canary.run_sync('depool', 'systemctl restart aqs') ask_confirmation('Please test aqs on the canary.') logger.info('Pool the canary back.') self.aqs_canary.run_sync('pool') aqs_lbconfig = self.remote.query_confctl( self.confctl, cluster=self.cluster, name=r'(?!' + self.aqs_canary.hosts[0] + ').*') logger.info('Restarting remaining daemons (one host at a time).') aqs_lbconfig.run('systemctl restart aqs', svc_to_depool=['aqs'], batch_size=1, max_failed_batches=2, batch_sleep=30.0) logger.info("All AQS service restarts completed!")
def _reboot(self, hosts: RemoteHosts) -> None:
    """Reboot a set of hosts with downtime.

    Arguments:
        hosts (`RemoteHosts`): the hosts to reboot.

    """
    puppet = self._spicerack.puppet(hosts)
    icinga_hosts = self._spicerack.icinga_hosts(hosts.hosts)
    try:
        duration = timedelta(minutes=20)
        with icinga_hosts.downtimed(self.reason, duration=duration):
            reboot_time = datetime.utcnow()
            confirm_on_failure(hosts.reboot, batch_size=len(hosts))
            hosts.wait_reboot_since(reboot_time, print_progress_bars=False)
            puppet.run(quiet=True)
            puppet.wait_since(reboot_time)
            icinga_hosts.wait_for_optimal()
        self.results.success(hosts.hosts)
    except IcingaError as error:
        ask_confirmation(f'Failed to downtime hosts: {error}')
        self.logger.warning(error)
    except AbortError as error:
        # Some host failed to come up again, or something fundamental broke.
        # Log an error and continue *without* repooling.
        self.logger.error(error)
        self.logger.error('Error rebooting hosts %s; they may still be depooled', hosts)
        self.results.fail(hosts.hosts)
        raise
def __init__(self, args, spicerack): """Initialize an Hadoop worker.""" self.success_percent_cumin = args.success_percent / 100 self.skip_disks = args.skip_disks self.disks_number = args.disks_number self.hostname_pattern = args.hostname_pattern self.partitions_basedir = args.partitions_basedir self.wipe_partitions = args.wipe_partitions self.hadoop_workers = spicerack.remote().query(self.hostname_pattern) letters = list(string.ascii_lowercase) if len(letters[self.skip_disks:]) < self.disks_number: raise RuntimeError( 'The number of available letters is not enough to support {} disks, ' 'please check your parameters:\n{}'.format( self.disks_number, letters[self.skip_disks:])) self.available_disk_labels = letters[self. skip_disks:self.disks_number + self.skip_disks] ask_confirmation( 'Please check that the hosts to initialize are the expected ones: {}' .format(self.hadoop_workers.hosts)) ask_confirmation( 'Please check that the disk labels to act on are the expected ' 'ones: {}'.format(str(self.available_disk_labels))) ensure_shell_is_durable()
def __init__(self, args, spicerack): """Change Hadoop distribution on all the clients of a given cluster""" if args.cluster == 'test': cumin_labels = HADOOP_TEST_CLIENT_CUMIN_ALIASES elif args.cluster == 'analytics': cumin_labels = HADOOP_CLIENT_CUMIN_ALIASES else: raise RuntimeError("Hadoop cluster {} not supported.".format(args.cluster)) ensure_shell_is_durable() spicerack_remote = spicerack.remote() if args.cumin_client_label: if args.cumin_client_label not in cumin_labels: raise RuntimeError( "Cumin label {} not supported. Please use one of: {}" .format(args.cumin_client_label, cumin_labels)) cumin_labels = [args.cumin_client_label] self.hadoop_client_hosts = spicerack_remote.query(' or '.join(cumin_labels)) self.icinga_hosts = spicerack.icinga_hosts(self.hadoop_client_hosts.hosts) self.admin_reason = spicerack.admin_reason('Change Hadoop distribution') self.rollback = args.rollback self.cluster = args.cluster ask_confirmation( "This cookbook assumes that the Hadoop cluster runs already the new distro, " "please do not proceed otherwise.")
def check_patterns_in_repo(host_paths, patterns):
    """Git grep for all the given patterns in the given hosts and paths, and ask for
    confirmation if any match is found.

    Arguments:
        host_paths (sequence): a sequence of 2-item tuples with the RemoteHost instance
            and the path of the repository to check.
        patterns (sequence): a sequence of patterns to check.

    """
    # The doubled braces around {{path}} escape this first format() call, leaving a
    # literal {path} placeholder to be filled in later, once per repository.
    grep_command = "git -C '{{path}}' grep -E '({patterns})'".format(patterns='|'.join(patterns))
    ask = False
    for remote_host, path in host_paths:
        logger.info('Looking for matches in %s:%s', remote_host, path)
        for _nodeset, _output in remote_host.run_sync(
                Command(grep_command.format(path=path), ok_codes=[])):
            ask = True

    if ask:
        ask_confirmation(
            'Found match(es) in the Puppet or mediawiki-config repositories '
            '(see above), proceed anyway?')
    else:
        logger.info('No matches found in the Puppet or mediawiki-config repositories')
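# Illustration (not part of the original cookbook): the doubled braces in
# '{{path}}' survive the first str.format() call as a literal '{path}', which
# the loop above then fills in once per repository. Pattern values are hypothetical.
template = "git -C '{{path}}' grep -E '({patterns})'".format(patterns='|'.join(['foo', 'bar']))
assert template == "git -C '{path}' grep -E '(foo|bar)'"
assert template.format(path='/srv/repo') == "git -C '/srv/repo' grep -E '(foo|bar)'"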
def run(self): """Required by Spicerack API.""" ask_confirmation(self.message) try: self.remote_hosts.run_sync(self.command) except RemoteExecutionError: logger.error('Some hosts were not reachable, see the list above. Typically caused by hardware maintenance.')
def test_ask_confirmation_abort(mocked_isatty, mocked_input):
    """Calling ask_confirmation() should raise AbortError if 'abort' is provided."""
    mocked_isatty.return_value = True
    mocked_input.return_value = 'abort'
    message = 'Test message'
    with pytest.raises(interactive.AbortError, match='Confirmation manually aborted'):
        interactive.ask_confirmation(message)
def test_ask_confirmation_go(mocked_isatty, mocked_input, capsys):
    """Calling ask_confirmation() should not raise if the correct answer is provided."""
    mocked_isatty.return_value = True
    mocked_input.return_value = 'go'
    message = 'Test message'
    interactive.ask_confirmation(message)
    out, _ = capsys.readouterr()
    assert message in out
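# A minimal sketch of the behaviour the two tests above pin down; this is an
# assumed shape for illustration, not the actual wmflib/spicerack implementation.
class AbortError(Exception):
    """Raised when the user manually aborts the current action."""


def ask_confirmation(message):
    """Print the message, then require 'go' to proceed or 'abort' to bail out."""
    print(message)
    while True:
        answer = input("Type 'go' to proceed or 'abort' to interrupt: ")
        if answer == 'go':
            return
        if answer == 'abort':
            raise AbortError('Confirmation manually aborted')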
def run(self): """Restart all Hadoop jvm daemons on a given cluster""" with self.icinga_hosts.downtimed(self.admin_reason, duration=timedelta(minutes=120)): logger.info("Restarting Yarn Resourcemanager on Master.") self.hadoop_master.run_sync('systemctl restart hadoop-yarn-resourcemanager') logger.info("Sleeping %s seconds.", self.yarn_rm_sleep) time.sleep(self.yarn_rm_sleep) logger.info("Restarting Yarn Resourcemanager on Standby.") self.hadoop_standby.run_sync('systemctl restart hadoop-yarn-resourcemanager') print_hadoop_service_state( self.hadoop_master, self.hadoop_master_service, self.hadoop_standby_service, hdfs=False) ask_confirmation("Ok to proceed with HDFS Namenodes ?") logger.info("Run manual HDFS failover from master to standby.") run_hdfs_namenode_failover(self.hadoop_master, self.hadoop_master_service, self.hadoop_standby_service) logger.info("Sleeping 30 seconds.") time.sleep(30) logger.info("Restart HDFS Namenode on the master.") self.hadoop_master.run_async( 'systemctl restart hadoop-hdfs-zkfc', 'systemctl restart hadoop-hdfs-namenode') logger.info("Sleeping %s seconds.", self.hdfs_nn_sleep) time.sleep(self.hdfs_nn_sleep) print_hadoop_service_state( self.hadoop_master, self.hadoop_master_service, self.hadoop_standby_service, yarn=False) ask_confirmation("Ok to proceed?") logger.info("Run manual HDFS failover from standby to master.") run_hdfs_namenode_failover(self.hadoop_master, self.hadoop_standby_service, self.hadoop_master_service) logger.info("Sleeping 30 seconds.") time.sleep(30) logger.info("Restart HDFS Namenode on the standby.") self.hadoop_standby.run_async( 'systemctl restart hadoop-hdfs-zkfc', 'systemctl restart hadoop-hdfs-namenode') logger.info("Sleeping %s seconds.", self.hdfs_nn_sleep) time.sleep(self.hdfs_nn_sleep) logger.info("\n\nSummary of active/standby statuses after the restarts:") print_hadoop_service_state( self.hadoop_master, self.hadoop_master_service, self.hadoop_standby_service) logger.info("Restart MapReduce historyserver on the master.") self.hadoop_master.run_sync('systemctl restart hadoop-mapreduce-historyserver')
def __init__(self, args, spicerack): """Initialize the runner""" if args.cluster == 'test': self.cluster_cumin_alias = 'A:hadoop-worker-test' self.hdfs_jn_cumin_alias = 'A:hadoop-hdfs-journal-test' elif args.cluster == 'analytics': self.cluster_cumin_alias = 'A:hadoop-worker' self.hdfs_jn_cumin_alias = 'A:hadoop-hdfs-journal' else: raise RuntimeError("Hadoop cluster {} not supported.".format( args.cluster)) ensure_shell_is_durable() self.cluster = args.cluster self.hadoop_workers = spicerack.remote().query( self.cluster_cumin_alias) self.hadoop_hdfs_journal_workers = spicerack.remote().query( self.hdfs_jn_cumin_alias) self.icinga_hosts = spicerack.icinga_hosts(self.hadoop_workers.hosts) self.admin_reason = spicerack.admin_reason( 'Roll restart of jvm daemons for openjdk upgrade.') self.yarn_nm_batch_size = args.yarn_nm_batch_size self.yarn_nm_sleep = args.yarn_nm_sleep_seconds # Not configurable on purpose, too risky! self.hdfs_jn_batch_size = 1 self.hdfs_jn_sleep = args.hdfs_jn_sleep_seconds self.hdfs_dn_batch_size = args.hdfs_dn_batch_size self.hdfs_dn_sleep = args.hdfs_dn_sleep_seconds # Safety checks if self.hdfs_dn_batch_size > 5: ask_confirmation( 'The HDFS Datanode batch size is bigger than 5, are you sure?') if self.hdfs_dn_sleep < 20: ask_confirmation( 'The HDFS Datanode sleep between each batch is less than 20s, are you sure?' ) if self.hdfs_jn_sleep < 20: ask_confirmation( 'The HDFS Journalnode sleep between each batch is less than 20s, are you sure?' ) if self.yarn_nm_batch_size > 10: ask_confirmation( 'The Yarn Nodemanager batch size is bigger than 10, are you sure?' ) if self.yarn_nm_sleep < 20: ask_confirmation( 'The Yarn Nodemanager sleep between each batch is less than 20s, are you sure?' )
def __init__(self, args, spicerack): """Change Hadoop distribution on a given cluster""" if args.cluster == 'test': suffix = '-test' elif args.cluster == 'analytics': suffix = '' else: raise RuntimeError("Hadoop cluster {} not supported.".format(args.cluster)) ensure_shell_is_durable() spicerack_remote = spicerack.remote() self.hadoop_hosts = spicerack_remote.query(CLUSTER_CUMIN_ALIAS + suffix) self.hadoop_hdfs_journal_workers = spicerack_remote.query(HDFS_JOURNAL_CUMIN_ALIAS + suffix) if args.journalnodes_cumin_query: hadoop_hdfs_journal_override = spicerack_remote.query(args.journalnodes_cumin_query) self.hadoop_hdfs_journal_workers = spicerack_remote.query( "D{{{}}}".format( self.hadoop_hdfs_journal_workers.hosts.intersection(hadoop_hdfs_journal_override.hosts))) ask_confirmation( 'The cookbook will run only on the following journal hosts ({}), please verify that ' 'the list looks correct: {}' .format(len(self.hadoop_hdfs_journal_workers), self.hadoop_hdfs_journal_workers)) self.hadoop_workers = spicerack_remote.query(WORKERS_CUMIN_ALIAS + suffix) if args.workers_cumin_query: hadoop_workers_override = spicerack_remote.query(args.workers_cumin_query) self.hadoop_workers = spicerack_remote.query( "D{{{}}}".format(self.hadoop_workers.hosts.intersection(hadoop_workers_override.hosts))) ask_confirmation( 'The cookbook will run only on the following worker hosts ({}), please verify that ' 'the list looks correct: {}' .format(len(self.hadoop_workers), self.hadoop_workers)) self.hadoop_master = spicerack_remote.query(MASTER_CUMIN_ALIAS + suffix) self.hadoop_standby = spicerack_remote.query(STANDBY_CUMIN_ALIAS + suffix) self.icinga_hosts = spicerack.icinga_hosts(self.hadoop_hosts.hosts) self.admin_reason = spicerack.admin_reason('Change Hadoop distribution') self.rollback = args.rollback self.cluster = args.cluster self.apt_install_options = '-y -o Dpkg::Options::="--force-confdef" -o Dpkg::Options::="--force-confold"' # Workaround needed for https://issues.apache.org/jira/browse/YARN-8310 self.yarn_metadata_cleanup_commands = [ f'setAcl /yarn-rmstore/analytics{suffix}-hadoop/ZKRMStateRoot world:anyone:cdrwa', f'rmr /yarn-rmstore/analytics{suffix}-hadoop/ZKRMStateRoot']
def _install_os(self):
    """Perform the OS reinstall."""
    pxe_reboot_time = datetime.utcnow()
    self.ipmi.force_pxe()
    self.host_actions.success('Forced PXE for next reboot')
    self.ipmi.reboot()
    self.host_actions.success('Host rebooted via IPMI')
    self.remote_installer.wait_reboot_since(pxe_reboot_time, print_progress_bars=False)
    time.sleep(30)  # Avoid race conditions, the host is in the d-i, need to wait anyway
    di_reboot_time = datetime.utcnow()
    env_command = 'grep -q "BOOT_IMAGE=debian-installer" /proc/cmdline'
    try:
        self.remote_installer.run_sync(env_command, print_output=False, print_progress_bars=False)
    except RemoteExecutionError:
        ask_confirmation(
            'Unable to verify that the host is inside the Debian installer, please verify manually '
            f'with: sudo install_console {self.fqdn}')

    self.host_actions.success('Host up (Debian installer)')
    self.remote_installer.wait_reboot_since(di_reboot_time, print_progress_bars=False)
    try:
        self.remote_installer.run_sync(f'! {env_command}', print_output=False, print_progress_bars=False)
    except RemoteExecutionError:
        ask_confirmation(
            'Unable to verify that the host rebooted into the new OS, it might still be in the '
            f'Debian installer, please verify manually with: sudo install_console {self.fqdn}')

    result = self.remote_installer.run_sync('lsb_release -sc', print_output=False, print_progress_bars=False)
    for _, output in result:
        distro = output.message().decode()
        if distro != self.args.os:
            message = f'New OS is {distro} but {self.args.os} was requested'
            self.host_actions.failure(message)
            raise RuntimeError(message)

    self.host_actions.success(f'Host up (new fresh {distro} OS)')
def __init__(self, args, spicerack): """Decommission a host from all inventories.""" ensure_shell_is_durable() self.remote = spicerack.remote() try: self.decom_hosts = self.remote.query(args.query).hosts except RemoteError: logger.debug("Query '%s' did not match any host or failed", args.query, exc_info=True) decom_hosts = NodeSet(args.query) ask_confirmation( 'ATTENTION: the query does not match any host in PuppetDB or failed\n' 'Hostname expansion matches {n} hosts: {hosts}\n' 'Do you want to proceed anyway?'.format(n=len(decom_hosts), hosts=decom_hosts)) self.decom_hosts = decom_hosts if len(self.decom_hosts) > 20: raise RuntimeError( 'Matched {} hosts, aborting. (max 20 with --force, 5 without)'. format(len(self.decom_hosts))) if len(self.decom_hosts) > 5: if args.force: logger.info( 'Authorized decommisioning of %s hosts with --force', len(self.decom_hosts)) else: raise RuntimeError( 'Matched {} hosts, and --force not set aborting. (max 20 with --force, 5 without)' .format(len(self.decom_hosts))) ask_confirmation( 'ATTENTION: destructive action for {n} hosts: {hosts}\nAre you sure to proceed?' .format(n=len(self.decom_hosts), hosts=self.decom_hosts)) self.spicerack = spicerack self.task_id = args.task_id self.puppet_master = self.remote.query(get_puppet_ca_hostname()) self.kerberos_kadmin = self.remote.query(KERBEROS_KADMIN_CUMIN_ALIAS) self.dns = self.spicerack.dns() self.deployment_host = self.remote.query( self.dns.resolve_cname(DEPLOYMENT_HOST)) self.patterns = get_grep_patterns(self.dns, self.decom_hosts) self.reason = self.spicerack.admin_reason('Host decommission', task_id=self.task_id)
def run(args, spicerack):
    """Required by Spicerack API."""
    post_process_args(args)
    if args.live_test:
        logger.info('Inverting DC to perform the warmup in %s (passive DC)', args.dc_from)
        datacenter = args.dc_from
    else:
        datacenter = args.dc_to

    ask_confirmation('Are you sure to warmup caches in {dc}?'.format(dc=datacenter))

    warmup_dir = '/var/lib/mediawiki-cache-warmup'
    # urls-cluster runs only against the appservers cluster, since it is for shared
    # resources behind the servers themselves.
    warmups = ['nodejs {dir}/warmup.js {dir}/urls-cluster.txt spread appservers.svc.{dc}.wmnet'
               .format(dir=warmup_dir, dc=datacenter)]
    for cluster in ['appserver', 'api_appserver']:
        # urls-server runs against both the appserver and API clusters, since it is for
        # each individual server.
        warmups.append('nodejs {dir}/warmup.js {dir}/urls-server.txt clone {cluster} {dc}'
                       .format(dir=warmup_dir, dc=datacenter, cluster=cluster))

    maintenance_host = spicerack.mediawiki().get_maintenance_host(datacenter)

    # It takes multiple executions of the warmup script to fully warm up the appserver
    # caches. The second run is faster than the first, and so on. Empirically, we consider
    # the caches to be fully warmed up when this speedup disappears; that is, when the
    # execution time converges, and each attempt takes about as long as the one before.
    logger.info('Running warmup script in %s.', datacenter)
    logger.info('The script will re-run until execution time converges.')
    last_duration = datetime.timedelta.max
    for i in itertools.count(1):
        logger.info('Running warmup script, take %d', i)
        start_time = datetime.datetime.utcnow()
        maintenance_host.run_sync(*warmups)
        duration = datetime.datetime.utcnow() - start_time
        logger.info('Warmup completed in %s', duration)
        # After we've done a minimum number of iterations, we stop looping as soon as the
        # warmup script takes more than 95% as long as the previous run. That is, keep
        # looping as long as it keeps going faster than before, but with a 5% margin of
        # error. At that point, any further reduction is probably just noise.
        if i >= MINIMUM_ITERATIONS and duration > 0.95 * last_duration:
            break
        last_duration = duration

    logger.info('Execution time converged, warmup complete.')
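# Worked example (not part of the original cookbook) of the convergence check,
# assuming MINIMUM_ITERATIONS has already been reached: with run durations of
# 100s, 60s and then 58s, the loop stops at the third take because
# 58 > 0.95 * 60 = 57, i.e. that run was less than 5% faster than the previous one.
from datetime import timedelta

last_duration = timedelta(seconds=60)
duration = timedelta(seconds=58)
assert duration > 0.95 * last_duration  # converged, stop looping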
def __init__(self, args, spicerack): """Initialize the runner""" ensure_shell_is_durable() self.cluster_cumin_alias = "A:zookeeper-" + args.cluster self.zookeeper = spicerack.remote().query(self.cluster_cumin_alias) self.icinga_hosts = spicerack.icinga_hosts(self.zookeeper.hosts) self.admin_reason = spicerack.admin_reason( 'Roll restart of jvm daemons.') self.batch_sleep_seconds = args.batch_sleep_seconds # Safety checks self.zookeeper.run_sync('echo stats | nc -q 1 localhost 2181') logger.info('\n=========================================\n') ask_confirmation( 'Please check the status of Zookeeper before proceeding.' 'There must be only one leader and the rest must be followers.')
def run(self): """Reboot all Hadoop workers of a given cluster""" if self.workers_cumin_query: hadoop_workers = self.spicerack_remote.query(self.cluster_cumin_alias) hadoop_workers_override = self.spicerack_remote.query(self.workers_cumin_query) hadoop_workers = self.spicerack_remote.query( "D{{{}}}".format(hadoop_workers.hosts.intersection(hadoop_workers_override.hosts))) ask_confirmation( 'The user chose to limit the number of Hadoop workers to reboot. ' 'This option does not care about Journal nodes and it will only reboot ' 'hosts following the batch size ({}). This means that more than one Journal node ' 'may potentially be rebooted at the same time. Please check the list of hosts ({}) ' 'before proceeding: {}'.format(self.reboot_batch_size, len(hadoop_workers), hadoop_workers)) worker_hostnames_n_slices = math.floor(len(hadoop_workers.hosts) / self.reboot_batch_size) logger.info('Rebooting Hadoop workers') for hadoop_workers_batch in hadoop_workers.split(worker_hostnames_n_slices): logger.info("Currently processing: %s", hadoop_workers_batch.hosts) self._reboot_hadoop_workers(hadoop_workers_batch) else: # The test cluster have few worker nodes, all running HDFS Datanodes # and Journalnodes, so we need a simpler procedure for this use case. if self.cluster != 'test': hadoop_workers_no_journal = self.spicerack_remote.query( self.cluster_cumin_alias + ' and not ' + self.hdfs_jn_cumin_alias) # Split the workers into batches of hostnames worker_hostnames_n_slices = math.floor(len(hadoop_workers_no_journal.hosts) / self.reboot_batch_size) logger.info('Rebooting Hadoop workers NOT running a HDFS Journalnode') for hadoop_workers_batch in hadoop_workers_no_journal.split(worker_hostnames_n_slices): logger.info("Currently processing: %s", hadoop_workers_batch.hosts) self._reboot_hadoop_workers(hadoop_workers_batch) logger.info('Rebooting Hadoop workers running a HDFS Journalnode') # Using the following loop to iterate over every HDFS JournalNode # one at the time. hadoop_hdfs_journal_workers = self.spicerack_remote.query(self.hdfs_jn_cumin_alias) for hadoop_workers_batch in hadoop_hdfs_journal_workers.split(len(hadoop_hdfs_journal_workers.hosts)): logger.info("Currently processing: %s", hadoop_workers_batch.hosts) self._reboot_hadoop_workers(hadoop_workers_batch, stop_journal_daemons=True) logger.info('All reboots completed!')
def _restart_daemons(self, hosts: RemoteHosts) -> None:
    """Restart daemons on a set of hosts with downtime.

    Arguments:
        hosts (`RemoteHosts`): the hosts to act on.

    """
    systemd_cmd = '/bin/systemctl'
    if self._args.ignore_restart_errors:
        # Only restart services which are active
        restart_cmds = [
            f'{systemd_cmd} --quiet is-active {daemon} && {systemd_cmd} restart {daemon} || /bin/true'
            for daemon in self.restart_daemons
        ]
    else:
        restart_cmds = [f"{systemd_cmd} restart {' '.join(self.restart_daemons)}"]

    puppet = self._spicerack.puppet(hosts)
    icinga_hosts = self._spicerack.icinga_hosts(hosts.hosts)
    try:
        duration = timedelta(minutes=20)
        with icinga_hosts.downtimed(self.reason, duration=duration):
            now = datetime.utcnow()
            confirm_on_failure(hosts.run_sync, *restart_cmds)
            puppet.run(quiet=True)
            puppet.wait_since(now)
            icinga_hosts.wait_for_optimal()
        self.results.success(hosts.hosts)
    except IcingaError as error:
        ask_confirmation(f'Failed to downtime hosts: {error}')
        self.logger.warning(error)
    except AbortError as error:
        # Some host failed to come up again, or something fundamental broke.
        # Log an error and exit *without* repooling.
        self.logger.error(error)
        self.logger.error('Error restarting daemons on hosts %s; they may still be depooled', hosts)
        self.results.fail(hosts.hosts)
        raise
def __init__(self, args, spicerack): """Initialize the runner.""" self.duration = timedelta(days=args.days, hours=args.hours, minutes=args.minutes) if args.force: self.hosts = NodeSet(args.query) ask_confirmation( f'Will downtime {len(self.hosts)} unverified hosts: {self.hosts}' ) else: self.hosts = spicerack.remote().query(args.query).hosts if not self.hosts: raise RuntimeError( f'No host found for query "{args.query}". Use --force targeting Icinga hosts that ' 'are not real hosts.') self.task_id = args.task_id self.icinga_hosts = spicerack.icinga_hosts(self.hosts, verbatim_hosts=args.force) self.reason = spicerack.admin_reason(args.reason, task_id=args.task_id) if args.force_puppet: self.puppet = spicerack.puppet(spicerack.icinga_master_host) else: self.puppet = None if args.task_id is not None: self.phabricator = spicerack.phabricator( PHABRICATOR_BOT_CONFIG_FILE) else: self.phabricator = None if len(self.hosts) <= 5: hosts_message = str(self.hosts) else: hosts_message = f'{len(self.hosts)} hosts' self.short_message = f'for {self.duration} on {hosts_message} with reason: {args.reason}' self.long_message = ( f'Icinga downtime set by {self.reason.owner} for {self.duration} {len(self.hosts)} ' f'host(s) and their services with reason: {args.reason}\n```\n{self.hosts}\n```' )
def _update_local(self, working_dir: Path, message: str) -> None:
    """Update the repo with data from fetch_data.

    Arguments:
        working_dir (pathlib.Path): the temporary directory used to build diffs.
        message (str): the commit message.

    """
    repo_dir = working_dir / "repo"
    data_dir = working_dir / self._data_subdir
    working_repo = self._repo.clone(repo_dir)
    # Delete all existing files to ensure removal of stale data
    working_repo.git.rm("./", r=True, ignore_unmatch=True)
    # TODO: on python >= 3.8 we can use shutil.copytree with dirs_exist_ok=True
    copy_tree(str(data_dir), str(repo_dir), preserve_symlinks=1)
    self._commit(working_repo, message)
    print(working_repo.git.show(["--color=always", "HEAD"]))
    ask_confirmation(f"Ok to push changes to {self._repo.common_dir}")
    self._push(working_repo)
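# Sketch of the TODO above, for Python >= 3.8 only: shutil.copytree gained
# dirs_exist_ok, so it can replace the deprecated distutils copy_tree call.
import shutil

shutil.copytree(data_dir, repo_dir, symlinks=True, dirs_exist_ok=True)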
def run(self): """Reboot all Druid nodes in a given cluster""" self.druid_workers.run_async( 'systemctl --quiet is-active zookeeper && echo stats | nc localhost 2181 | grep Mode || exit 0' ) ask_confirmation( 'From the output of the last command, please check the status of the' ' Zookeeper cluster before proceeding.' ' There must be only one leader and the rest must be followers.') for host in self.druid_workers.hosts: logger.info('Start reboot of druid node %s', host) self.reboot_druid_node(host) logger.info( 'Reboot completed for node %s. Waiting 10 minutes for daemons to catch up', host) sleep(600) logger.info('All Druid node reboots completed!')
def __init__(self, args, spicerack): """Initialize the runner.""" if args.force: self.hosts = NodeSet(args.query) ask_confirmation( f'Will remove downtime for {len(self.hosts)} unverified hosts: {self.hosts}' ) else: self.hosts = spicerack.remote().query(args.query).hosts if not self.hosts: raise RuntimeError( f'No host found for query "{args.query}". Use --force targeting Icinga hosts that ' 'are not real hosts.') self.icinga_hosts = spicerack.icinga_hosts(self.hosts, verbatim_hosts=args.force) if len(self.hosts) <= 5: self.hosts_message = str(self.hosts) else: self.hosts_message = f'{len(self.hosts)} hosts'
def __init__(self, args, spicerack): """Initialize the runner.""" ensure_shell_is_durable() self.cluster_cumin_alias = "A:kafka-" + args.cluster self.kafka_brokers = spicerack.remote().query(self.cluster_cumin_alias) self.icinga_hosts = spicerack.icinga_hosts(self.kafka_brokers.hosts) self.admin_reason = spicerack.admin_reason( 'Roll restart of jvm daemons for openjdk upgrade.') self.batch_sleep_seconds = args.batch_sleep_seconds self.sleep_before_pref_replica_election = args.sleep_before_pref_replica_election ask_confirmation( 'Please check the Grafana dashboard of the cluster and make sure that ' 'topic partition leaders are well balanced and that all brokers are up and running.' ) if args.sleep_before_pref_replica_election < 900: ask_confirmation( 'The sleep time between a broker restart and kafka preferred-replica-election ' 'is less than 900 seconds. The broker needs some time to recover after a restart. ' 'Are you sure?')
def __init__(self, args, spicerack): """Create a new Virtual Machine in Ganeti.""" self.cluster, self.row, self.datacenter = get_locations()[args.location] self.hostname = args.hostname self.vcpus = args.vcpus self.memory = args.memory self.network = args.network self.disk = args.disk self.skip_v6 = args.skip_v6 self.spicerack = spicerack self.netbox = self.spicerack.netbox(read_write=True) self.fqdn = make_fqdn(self.hostname, self.network, self.datacenter) self.allocated = [] # Store allocated IPs to rollback them on failure self.dns_propagated = False # Whether to run the DNS cookbook on rollback self.need_netbox_sync = False # Whether to sync the VM to Netbox on rollback print('Ready to create Ganeti VM {a.fqdn} in the {a.cluster} cluster on row {a.row} with {a.vcpus} vCPUs, ' '{a.memory}GB of RAM, {a.disk}GB of disk in the {a.network} network.'.format(a=self)) ask_confirmation('Is this correct?') ensure_shell_is_durable()
def run(self): """Change the Hadoop distribution.""" with self.icinga_hosts.downtimed(self.admin_reason, duration=timedelta(minutes=30)): if not self.rollback: logger.info( 'Saving a snapshot of cdh package names and versions in /root/cdh_package_list ' 'on all nodes, and removing all packages.') confirm_on_failure( self.hadoop_client_hosts.run_sync, "dpkg -l | awk '/ii.*+cdh/ {print $2\" \"}' > /root/cdh_package_list") self._remove_packages() confirm_on_failure(self.hadoop_client_hosts.run_async, 'apt-get update') confirm_on_failure( self.hadoop_client_hosts.run_sync, 'apt-cache policy hadoop | grep Candidate') ask_confirmation('Please verify that the candidate hadoop package is correct across all nodes.') self._install_packages_on_clients() logger.info('The procedure is completed.')
def _install_packages_on_workers(self):
    """Install Hadoop packages on Hadoop worker nodes."""
    logger.info('Install packages on worker nodes (long step).')
    if self.rollback:
        confirm_on_failure(
            self.hadoop_workers.run_sync,
            'apt-get install -y `cat /root/cdh_package_list`',
            batch_size=5, batch_sleep=60.0,
            success_threshold=HADOOP_WORKERS_CUMIN_SUCCESS_THRESHOLD)
    else:
        apt_package_filter = "|".join(CDH_PACKAGES_NOT_IN_BIGTOP)
        confirm_on_failure(
            self.hadoop_workers.run_sync,
            "apt-get install -y `cat /root/cdh_package_list | tr ' ' '\n' | "
            f"egrep -v '{apt_package_filter}' | tr '\n' ' '`",
            batch_size=5, batch_sleep=60.0,
            success_threshold=HADOOP_WORKERS_CUMIN_SUCCESS_THRESHOLD)

    # If the cookbook is running in rollback mode, then there are extra steps
    # to be taken for HDFS Datanodes.
    if self.rollback:
        logger.info('Stop each datanode and start it with the rollback option. Long step.')
        confirm_on_failure(
            self.hadoop_workers.run_async,
            'systemctl unmask hadoop-hdfs-datanode',
            'service hadoop-hdfs-datanode rollback',
            batch_size=2, batch_sleep=30.0,
            success_threshold=HADOOP_WORKERS_CUMIN_SUCCESS_THRESHOLD)

    logger.info('Checking how many java daemons are running on the worker nodes '
                'after installing the packages.')
    confirm_on_failure(
        self.hadoop_workers.run_sync,
        'ps aux | egrep "[j]ava.*(JournalNode|DataNode|NodeManager)" | wc -l',
        success_threshold=HADOOP_WORKERS_CUMIN_SUCCESS_THRESHOLD)
    ask_confirmation('Verify that the count is 2 for non-journal workers, and 3 for journal workers.')
def run(self): """Add a new node to a Ganeti cluster.""" print('Ready to add Ganeti node {} in the {} cluster'.format( self.fqdn, self.cluster)) ask_confirmation('Is this correct?') if self.fqdn not in self.remote.query('A:ganeti-all').hosts: raise RuntimeError( '{} does have not have the Ganeti role applied. Please fix and re-run the cookbook' .format(self.fqdn)) self.validate_state( 'ls /dev/kvm', 'does have not have virtualisation enabled in BIOS') self.validate_state( 'vgs | grep "ganeti "', ('No "ganeti" volume group found. You need to remove the swap device on /dev/md2, ' 'create a PV on /dev/md2 and eventually create a VG named "ganeti". Make sure to ' 'remove the stale swap entry from fstab as well'), ) self.validate_state( 'brctl show private | grep "en[o|p|s]"', 'No private bridge configured', ) self.validate_state( 'brctl show public | grep "en[o|p|s]"', 'No public bridge configured', ) if self.fqdn in self.remote.query('A:eqiad').hosts: self.validate_state( 'brctl show analytics | grep "en[o|p|s]"', 'No analytics bridge configured', ) self.master.run_sync( 'gnt-node add --no-ssh-key-check -g "{group}" "{node}"'.format( group=self.group, node=self.fqdn)) ask_confirmation('Has the node been added correctly?') self.master.run_sync('gnt-cluster verify') ask_confirmation('Verify that the cluster state looks correct.') self.master.run_sync('gnt-cluster verify-disks') ask_confirmation('Verify that the disk state looks correct.')
def __init__(self, args, spicerack): """Initialize the runner.""" self.debmonitor = spicerack.debmonitor() self.removed_hosts = 0 self.username = spicerack.username try: self.hosts = spicerack.remote().query(args.query).hosts except RemoteError: query_hosts = NodeSet(args.query) ask_confirmation( 'Your query did not match any hosts. This can happen if the host\n' 'record was already removed from Puppetdb, but persists in\n' 'DebMonitor. Do you want to proceed? The following {l} hosts will be\n' 'affected: {query_hosts}\n'. format(l=len(query_hosts), query_hosts=query_hosts)) self.hosts = query_hosts if args.task_id is not None: self.phabricator = spicerack.phabricator(PHABRICATOR_BOT_CONFIG_FILE) self.task_id = args.task_id else: self.phabricator = None self.log_message = 'for {n} hosts: {hosts}'.format(n=len(self.hosts), hosts=self.hosts)
def __init__(self, args, spicerack): """Initialize the runner.""" if args.cluster == 'test': self.suffix = '-test' self.cluster = 'test' elif args.cluster == 'analytics': self.suffix = '' self.cluster = 'analytics' else: raise RuntimeError("Hadoop cluster {} not supported.".format(args.cluster)) ensure_shell_is_durable() self.remote = spicerack.remote() self.hadoop_master = self.remote.query('A:hadoop-master' + self.suffix) self.hadoop_standby = self.remote.query('A:hadoop-standby' + self.suffix) self.icinga_hosts = spicerack.icinga_hosts(self.hadoop_master.hosts | self.hadoop_standby.hosts) self.admin_reason = spicerack.admin_reason('Restart of jvm daemons.') self.yarn_rm_sleep = args.yarn_rm_sleep_seconds self.hdfs_nn_sleep = args.hdfs_nn_sleep_seconds # Safety checks if self.hdfs_nn_sleep < 600: ask_confirmation('The HDFS Namenode restart sleep is less than 600s, are you sure?') if self.yarn_rm_sleep < 60: ask_confirmation('The Yarn Resourcemanager restart sleep is less than 60s, are you sure?') if len(self.hadoop_master) != 1: raise RuntimeError("Expecting exactly one Hadoop master server. Found: {}".format(self.hadoop_master)) if len(self.hadoop_standby) != 1: raise RuntimeError("Expecting exactly one Hadoop standby server. Found: {}".format(self.hadoop_standby)) # This is needed due to the format of the hostname in the command, for example: # sudo -u hdfs /usr/bin/hdfs haadmin -getServiceState an-master1001-eqiad-wmnet self.hadoop_master_service = self.hadoop_master.hosts[0].replace('.', '-') self.hadoop_standby_service = self.hadoop_standby.hosts[0].replace('.', '-') logger.info('Checking HDFS and Yarn daemon status. We expect active statuses on the Master node, ' 'and standby statuses on the other. Please do not proceed otherwise.') print_hadoop_service_state( self.hadoop_master, self.hadoop_master_service, self.hadoop_standby_service) ask_confirmation('Please make sure that the active/standby nodes shown are correct.')
def __init__(self, args, spicerack): """Initiliaze the reimage runner.""" ensure_shell_is_durable() self.args = args self.host = self.args.host self.netbox = spicerack.netbox() self.netbox_server = spicerack.netbox_server(self.host, read_write=True) self.netbox_data = self.netbox_server.as_dict() ask_confirmation( f'ATTENTION: destructive action for host: {self.host}\nAre you sure to proceed?' ) # Shortcut variables self.fqdn = self.netbox_server.fqdn self.mgmt_fqdn = self.netbox_server.mgmt_fqdn self.output_filename = self._get_output_filename(spicerack.username) self.actions = spicerack.actions self.host_actions = self.actions[self.host] self.confctl_services = [] if self.netbox_server.virtual: raise RuntimeError( f'Host {self.host} is a virtual machine. VMs are not yet supported.' ) self.dns = spicerack.dns() self.icinga_host = spicerack.icinga_hosts([self.host]) self.ipmi = spicerack.ipmi(self.mgmt_fqdn) self.reason = spicerack.admin_reason('Host reimage', task_id=self.args.task_id) self.puppet_master = spicerack.puppet_master() self.debmonitor = spicerack.debmonitor() self.confctl = spicerack.confctl('node') self.remote = spicerack.remote() self.spicerack = spicerack try: self.remote_host = self.remote.query(self.fqdn) if self.args.new: ask_confirmation( f'Host {self.fqdn} was found in PuppetDB but --new was set. Are you sure you want to ' 'proceed? The --new option will be unset') self.args.new = False # Unset --new logger.info('The option --new has been unset') except RemoteError as e: self.remote_host = self.remote.query( f'D{{{self.fqdn}}}') # Use the Direct backend instead if not self.args.new: raise RuntimeError( f'Host {self.fqdn} was not found in PuppetDB but --new was not set. Check that the ' 'FQDN is correct. If the host is new or has disappeared from PuppetDB because down ' 'for too long use --new.') from e if len(self.remote_host) != 1: raise RuntimeError( f'Expected 1 host for query {self.fqdn} but got {len(self.remote_host)}: {self.remote_host}' ) # The same as self.remote_host but using the SSH key valid only during installation before the first Puppet run self.remote_installer = spicerack.remote(installer=True).query( self.fqdn) # Get a Puppet instance for the current cumin host to update the known hosts file remote_localhost = self.remote.query(f'{self.reason.hostname}.*') if len(remote_localhost) != 1: raise RuntimeError( f'Localhost matched the wrong number of hosts ({len(remote_localhost)}) for ' f'query "{self.reason.hostname}.*": {remote_localhost}') self.puppet_localhost = spicerack.puppet(remote_localhost) self.puppet = spicerack.puppet(self.remote_host) # The same as self.puppet but using the SSH key valid only during installation before the first Puppet run self.puppet_installer = spicerack.puppet(self.remote_installer) # DHCP automation try: self.dhcp_hosts = self.remote.query( f'A:installserver-light and A:{self.netbox_data["site"]["slug"]}' ) except RemoteError: # Fallback to eqiad's install server if the above fails, i.e. for a new DC self.dhcp_hosts = self.remote.query( 'A:installserver-light and A:eqiad') self.dhcp = spicerack.dhcp(self.dhcp_hosts) self.dhcp_config = self._get_dhcp_config() self._validate() # Keep track of some specific actions for the eventual rollback self.rollback_masks = False self.rollback_depool = False if self.args.task_id is not None: self.phabricator = spicerack.phabricator( PHABRICATOR_BOT_CONFIG_FILE) else: self.phabricator = None