def setup_method(self, *args):  # pylint: disable=arguments-differ
    """Initialize default properties and instances."""
    self.target = Target(nodeset('node[1-2]'))
    self.commands = [Command('command1', ok_codes=[0, 100]), Command('command2', timeout=5)]
    self.worker = mock.MagicMock()
    self.worker.current_node = 'node1'
    self.worker.command = 'command1'
    self.worker.nodes = self.target.hosts
    self.handler = None
    self.args = args
def first_run(self, has_systemd: bool = True) -> Iterator[Tuple]:
    """Perform the first Puppet run on a clean host without using custom wrappers.

    Arguments:
        has_systemd (bool, optional): if the host has systemd as init system.

    """
    commands = []
    if has_systemd:
        commands += [
            "systemctl stop puppet.service",
            "systemctl reset-failed puppet.service || true",
        ]

    commands += [
        "puppet agent --enable",
        Command(
            ("puppet agent --onetime --no-daemonize --verbose --no-splay --show_diff --ignorecache "
             "--no-usecacheonfailure"),
            timeout=10800,
        ),
    ]
    logger.info("Starting first Puppet run (sit back, relax, and enjoy the wait)")
    results = self._remote_hosts.run_sync(*commands, print_output=False, print_progress_bars=False)
    logger.info("First Puppet run completed")

    return results
def check_patterns_in_repo(host_paths, patterns):
    """Git grep for all the given patterns in the given hosts and paths and ask for confirmation if any is found.

    Arguments:
        host_paths (sequence): a sequence of 2-item tuples with the RemoteHost instance and the path of the
            repository to check.
        patterns (sequence): a sequence of patterns to check.

    """
    grep_command = "git -C '{{path}}' grep -E '({patterns})'".format(patterns='|'.join(patterns))
    ask = False
    for remote_host, path in host_paths:
        logger.info('Looking for matches in %s:%s', remote_host, path)
        for _nodeset, _output in remote_host.run_sync(Command(grep_command.format(path=path), ok_codes=[])):
            ask = True

    if ask:
        ask_confirmation(
            'Found match(es) in the Puppet or mediawiki-config repositories (see above), proceed anyway?')
    else:
        logger.info('No matches found in the Puppet or mediawiki-config repositories')
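# A minimal sketch (hypothetical pattern and path values) of the two-stage formatting used by
# check_patterns_in_repo() above: the doubled braces keep {path} intact during the first .format()
# call so that it can be filled in per repository inside the loop.
patterns = ['sretest1001', 'sretest1002']
grep_command = "git -C '{{path}}' grep -E '({patterns})'".format(patterns='|'.join(patterns))
assert grep_command == "git -C '{path}' grep -E '(sretest1001|sretest1002)'"
# Second stage, performed once per repository path:
assert grep_command.format(path='/srv/git/puppet') == "git -C '/srv/git/puppet' grep -E '(sretest1001|sretest1002)'"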
def _populate_puppetdb(self):
    """Run Puppet in noop mode to populate the exported resources in PuppetDB to downtime it on Icinga."""
    self.remote_installer.run_sync(
        Command('puppet agent -t --noop &> /dev/null', ok_codes=[]), print_progress_bars=False)
    self.host_actions.success('Run Puppet in NOOP mode to populate exported resources in PuppetDB')

    @retry(tries=50, backoff_mode='linear')
    def poll_puppetdb():
        """Poll PuppetDB until we find the Nagios_host resource for the newly installed host."""
        puppetdb_host = self.dns.resolve_ptr(self.dns.resolve_ipv4('puppetdb-api.discovery.wmnet')[0])[0]
        response = requests.post(f'https://{puppetdb_host}/pdb/query/v4/resources/Nagios_host/{self.host}')
        json_response = response.json()
        if not json_response:  # PuppetDB returns an empty list for non-matching results
            raise SpicerackError(f'Nagios_host resource with title {self.host} not found yet')

        if len(json_response) != 1:
            raise RuntimeError(f'Expected 1 result from PuppetDB got {len(json_response)}')

        if json_response[0]['exported'] is not True:
            raise RuntimeError(
                f'Expected the Nagios_host resource to be exported, got: {json_response[0]["exported"]}')

    poll_puppetdb()
    self.host_actions.success('Found Nagios_host resource for this host in PuppetDB')
def get_status(self, service_re: str = "") -> HostsStatus:
    """Get the current status of the given hosts from Icinga.

    Arguments:
        service_re (str): if non-empty, the regular expression matching service names.

    Returns:
        spicerack.icinga.HostsStatus: the instance that represents the status for the given hosts.

    Raises:
        IcingaError: if unable to get the status.
        IcingaStatusParseError: when failing to parse the status.
        IcingaStatusNotFoundError: if a host is not found in the Icinga status.
        re.error: if service_re is an invalid regular expression.

    """
    if service_re:
        # Compile the regex and ignore the result, in order to raise re.error if it's malformed.
        re.compile(service_re)

    # icinga-status exits with non-zero exit code on missing and non-optimal hosts.
    verbatim = " --verbatim-hosts" if self._verbatim_hosts else ""
    services = (" --services " + shlex.quote(service_re)) if service_re else ""
    command = Command(
        f'/usr/local/bin/icinga-status -j{verbatim}{services} "{self._target_hosts}"',
        ok_codes=[],
    )
    for _, output in self._icinga_host.run_sync(
        command, is_safe=True, print_output=False, print_progress_bars=False
    ):  # icinga-status is a read-only script
        json_status = output.message().decode()
        break
    else:
        raise IcingaError("Unable to get the status for the given hosts, no output from icinga-status")

    try:
        status = json.loads(json_status)
    except json.JSONDecodeError as e:
        raise IcingaStatusParseError("Unable to parse Icinga status") from e

    missing_hosts = [hostname for hostname, host_status in status.items() if host_status is None]
    if missing_hosts:
        raise IcingaStatusNotFoundError(missing_hosts)

    return HostsStatus({hostname: HostStatus(**host_status) for hostname, host_status in status.items()})
def _prepend_sudo(command: Union[str, Command]) -> Union[str, Command]:
    """Return the given command with 'sudo -i' prepended, preserving the Command attributes if present."""
    if isinstance(command, str):
        return "sudo -i " + command

    return Command(
        "sudo -i " + command.command,
        timeout=command.timeout,
        ok_codes=command.ok_codes,
    )
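# A minimal usage sketch of _prepend_sudo() above with hypothetical inputs, assuming cumin's Command
# exposes the command, timeout and ok_codes attributes that the function itself relies on.
assert _prepend_sudo("systemctl status puppet") == "sudo -i systemctl status puppet"

wrapped = _prepend_sudo(Command("puppet agent -t --noop", timeout=600, ok_codes=[0, 2]))
assert wrapped.command == "sudo -i puppet agent -t --noop"
assert wrapped.timeout == 600      # the timeout is carried over to the new Command
assert wrapped.ok_codes == [0, 2]  # the accepted exit codes are carried over as well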
def setup_method(self, _, task_self):  # pylint: disable=arguments-differ
    """Initialize default properties and instances."""
    self.config = {
        'clustershell': {
            'ssh_options': ['-o StrictHostKeyChecking=no', '-o BatchMode=yes'],
            'fanout': 3}}
    self.target = Target(nodeset('node[1-2]'))
    self.worker = clustershell.worker_class(self.config, self.target)
    self.commands = [Command('command1'), Command('command2', ok_codes=[0, 100], timeout=5)]
    self.task_self = task_self
    # Mock default handlers
    clustershell.DEFAULT_HANDLERS = {
        'sync': mock.MagicMock(spec_set=clustershell.SyncEventHandler),
        'async': mock.MagicMock(spec_set=clustershell.AsyncEventHandler)}
    # Initialize the worker
    self.worker.commands = self.commands
def test_recheck_failed_services_optimal(self):
    """It should force a recheck of all services for the hosts on the Icinga server."""
    with open(get_fixture_path("icinga", "status_with_services.json")) as f:
        set_mocked_icinga_host_output(self.mocked_icinga_host, f.read())
    self.icinga_hosts.recheck_failed_services()
    # This also ensures that we are not making an additional call of run_sync in the recheck method
    self.mocked_icinga_host.run_sync.assert_called_with(
        Command('/usr/local/bin/icinga-status -j "host1"', ok_codes=[]),
        is_safe=True,
        print_output=False,
        print_progress_bars=False,
    )
def find_kerberos_credentials(remote_host, decom_hosts):
    """Check if any host provided has a Kerberos keytab stored on the KDC hosts."""
    cred_found = False
    logger.info('Looking for Kerberos credentials on KDC kadmin node.')
    for host in decom_hosts:
        find_keytabs_command = 'find {} -name "{}*"'.format(KERBEROS_KDC_KEYTAB_PATH, host)
        check_princs_command = '/usr/local/sbin/manage_principals.py list "*{}*"'.format(host)
        cumin_commands = [
            Command(find_keytabs_command, ok_codes=[]),
            Command(check_princs_command, ok_codes=[])
        ]
        for _nodeset, _output in remote_host.run_sync(*cumin_commands):
            cred_found = True

    if cred_found:
        logger.info(
            'Please follow this guide to drop unused credentials: '
            'https://wikitech.wikimedia.org/wiki/Analytics/Systems/Kerberos'
            '#Delete_Kerberos_principals_and_keytabs_when_a_host_is_decommissioned')
    else:
        logger.info('No Kerberos credentials found.')
def stop_periodic_jobs(self, datacenter: str) -> None:
    """Remove and ensure MediaWiki periodic jobs are disabled in the given DC.

    Arguments:
        datacenter (str): the name of the datacenter to work on.

    Raises:
        spicerack.remote.RemoteExecutionError: on failure.

    """
    targets = self.get_maintenance_host(datacenter)
    logger.info("Disabling MediaWiki periodic jobs in %s", datacenter)

    pkill_ok_codes = [0, 1]  # Accept both matches and no matches
    # Stop all systemd job units and timers
    targets.run_async("systemctl stop mediawiki_job_*")
    targets.run_async(
        # Kill MediaWiki wrappers, in case someone has started one manually.
        # See modules/scap/manifests/scripts.pp in the Puppet repo.
        Command('pkill --full "/usr/local/bin/foreachwiki"', ok_codes=pkill_ok_codes),
        Command('pkill --full "/usr/local/bin/foreachwikiindblist"', ok_codes=pkill_ok_codes),
        Command('pkill --full "/usr/local/bin/expanddblist"', ok_codes=pkill_ok_codes),
        Command('pkill --full "/usr/local/bin/mwscript"', ok_codes=pkill_ok_codes),
        Command('pkill --full "/usr/local/bin/mwscriptwikiset"', ok_codes=pkill_ok_codes),
        # Kill all remaining PHP (but not php-fpm) processes for all users
        Command("killall -r 'php$'", ok_codes=[]),
        "sleep 5",
        # No more time to be gentle
        Command("killall -9 -r 'php$'", ok_codes=[]),
        "sleep 1",
    )

    self.check_periodic_jobs_disabled(datacenter)

    try:
        # Look for remaining PHP (but not php-fpm) processes. php-fpm is used for
        # serving noc.wikimedia.org, which is independent of periodic jobs.
        targets.run_sync("! pgrep -c 'php$'", is_safe=True)
    except RemoteExecutionError:
        # We just log an error, don't actually report a failure to the system. We can live with this.
        logger.error("Stray php processes still present on the %s maintenance host, please check", datacenter)
def _httpbb(self):
    """Run the httpbb tests."""
    if not self.args.httpbb:
        return

    command = Command(f'httpbb /srv/deployment/httpbb-tests/appserver/* --host={self.fqdn}', timeout=120)
    deployment_host = self.remote.query(self.dns.resolve_cname('deployment.eqiad.wmnet'))
    logger.info('Running httpbb tests')
    try:
        deployment_host.run_sync(command, print_progress_bars=False)
        self.host_actions.success('Run of httpbb tests was successful')
    except RemoteExecutionError:
        # We don't want to fail upon this failure, this is just a validation test for the user.
        self.host_actions.warning('//Failed to run httpbb tests//')
def regenerate_certificate(self) -> Dict[str, str]:
    """Delete the local Puppet certificate and generate a new CSR.

    Returns:
        dict: a dictionary with hostnames as keys and CSR fingerprints as values.

    """
    logger.info("Deleting local Puppet certificate on %d hosts: %s", len(self), self)
    self._remote_hosts.run_sync("rm -rfv /var/lib/puppet/ssl")

    fingerprints = {}
    errors = []
    # The return codes for the cert generation are not well defined, so we check whether it worked
    # by searching for the fingerprint and parsing the output.
    command = Command("puppet agent --test --color=false", ok_codes=[])
    logger.info("Generating a new Puppet certificate on %d hosts: %s", len(self), self)
    for nodeset, output in self._remote_hosts.run_sync(command, print_output=False):
        for line in output.message().decode().splitlines():
            if line.startswith("Error:"):
                errors.append((nodeset, line))
                continue

            if "Certificate Request fingerprint" not in line:
                continue

            fingerprint = ":".join(line.split(":")[2:]).strip()
            if not fingerprint:
                continue

            logger.info("Generated CSR for host %s: %s", nodeset, fingerprint)
            for host in nodeset:
                fingerprints[host] = fingerprint

    if len(fingerprints) != len(self):
        raise PuppetHostsError(
            "Unable to find CSR fingerprints for all hosts, detected errors are:\n"
            + "\n".join(f"{nodeset}: {line}" for nodeset, line in errors))

    return fingerprints
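# A minimal sketch of the fingerprint parsing done in regenerate_certificate() above. The sample
# output line and fingerprint value are assumptions about the `puppet agent --test` output format.
line = "Info: Certificate Request fingerprint (SHA256): 2C:8B:0E:9A:1D:4F"
# Everything after the second ':' belongs to the fingerprint, which itself contains ':' separators.
fingerprint = ":".join(line.split(":")[2:]).strip()
assert fingerprint == "2C:8B:0E:9A:1D:4F"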
def run(  # pylint: disable=too-many-arguments
    self,
    timeout: int = 300,
    enable_reason: Optional[Reason] = None,
    quiet: bool = False,
    failed_only: bool = False,
    force: bool = False,
    attempts: int = 0,
    batch_size: int = 10,
) -> None:
    """Run Puppet.

    Arguments:
        timeout (int, optional): the timeout in seconds to set in Cumin for the execution of the command.
        enable_reason (spicerack.administrative.Reason, optional): the reason to use to contextually re-enable
            Puppet if it was disabled.
        quiet (bool, optional): suppress Puppet output if True.
        failed_only (bool, optional): run Puppet only if the last run failed.
        force (bool, optional): forcibly re-enable Puppet if it was disabled with ANY message.
        attempts (int, optional): override the default number of attempts to wait for an in-flight Puppet run
            to complete before timing out, as set in run-puppet-agent.
        batch_size (int, optional): how many concurrent Puppet runs to perform. The default value is tailored
            to not overload the Puppet masters.

    """
    args = []
    if enable_reason is not None:
        args += ["--enable", enable_reason.quoted()]
    if quiet:
        args.append("--quiet")
    if failed_only:
        args.append("--failed-only")
    if force:
        args.append("--force")
    if attempts:
        args += ["--attempts", str(attempts)]

    args_string = " ".join(args)
    command = f"run-puppet-agent {args_string}"

    logger.info("Running Puppet with args %s on %d hosts: %s", args_string, len(self), self)
    self._remote_hosts.run_sync(Command(command, timeout=timeout), batch_size=batch_size)
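# A minimal sketch of the flag assembly performed by run() above; the reason string and attempt
# count below are hypothetical example values.
args = ["--enable", '"host reimage - jdoe@cumin1001"', "--quiet", "--attempts", "30"]
command = f"run-puppet-agent {' '.join(args)}"
assert command == 'run-puppet-agent --enable "host reimage - jdoe@cumin1001" --quiet --attempts 30'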
def check_periodic_jobs_disabled(self, datacenter: str) -> None:
    """Check that MediaWiki periodic jobs are not enabled in the given DC.

    Arguments:
        datacenter (str): the name of the datacenter to work on.

    Raises:
        spicerack.remote.RemoteExecutionError: on failure.

    """
    targets = self.get_maintenance_host(datacenter)
    targets.run_async(
        Command(
            # List all timers that start with mediawiki_job_
            "systemctl list-units 'mediawiki_job_*' --no-legend "
            # Just get the timer name
            "| awk '{print $1}' "
            # For each, check `systemctl is-enabled`, which will pass if the unit is enabled.
            # Invert the status code so only disabled units pass. Exit code 255 instructs
            # xargs to immediately abort.
            "| xargs -n 1 sh -c 'systemctl is-enabled $0 && exit 255 || exit 0'",
        ),
        is_safe=True,
    )
def run(args, spicerack):  # pylint: disable=too-many-locals
    """Required by Spicerack API."""
    remote = spicerack.remote()
    netbox_hostname = spicerack.dns().resolve_cname(NETBOX_DOMAIN)
    netbox_host = remote.query(netbox_hostname)
    netbox_hosts = remote.query(NETBOX_HOSTS_QUERY)
    reason = spicerack.admin_reason(args.message, task_id=args.task_id)

    # Always set an accessible CWD for runuser because the Python git module passes it to Popen
    base_command = ('cd /tmp && runuser -u {user} -- python3 '
                    '/srv/deployment/netbox-extras/dns/generate_dns_snippets.py').format(user=NETBOX_USER)

    extra_options = ''
    if args.emergency_manual_edit:
        extra_options = '--keep-files '
    command_str = '{base} commit {opts}--batch "{owner}: {msg}"'.format(
        opts=extra_options, base=base_command, owner=reason.owner, msg=args.message)
    # NO_CHANGES_RETURN_CODE = 99 in generate_dns_snippets.py
    command = Command(command_str, ok_codes=[0, 99])

    logger.info('Generating the DNS records from Netbox data. It will take a couple of minutes.')
    results = netbox_host.run_sync(command, is_safe=True)

    metadata = {}
    for _, output in results:
        lines = output.message().decode()
        for line in lines.splitlines():
            if line.startswith('METADATA:'):
                metadata = json.loads(line.split(maxsplit=1)[1])
                break

    if spicerack.dry_run:
        if not metadata.get('no_changes', False):
            logger.info('Bailing out in DRY-RUN mode. Generated temporary files are available on %s:%s',
                        netbox_hostname, metadata.get('path'))
        return

    if args.emergency_manual_edit:
        logger.info('Generated temporary files are available on %s:%s', netbox_hostname, metadata.get('path'))
        logger.info('SSH there, as root modify any file, git stage them and run "git commit --amend" to commit them')
        logger.info('Then run "git log --pretty=oneline -1" and copy the new SHA1 of HEAD')
        metadata['sha1'] = input('Enter the new SHA1 of the commit to push: ')
        metadata['no_changes'] = False

    if metadata.get('no_changes', False):
        if args.force:
            logger.info('No changes to deploy but --force set to %s, continuing.', args.force)
            sha1 = args.force
        else:
            logger.info('No changes to deploy.')
            return
    else:
        ask_confirmation('Have you checked that the diff is OK?')
        sha1 = metadata.get('sha1', '')
        if not sha1:
            raise RuntimeError('Unable to fetch SHA1 from commit metadata: {meta}'.format(meta=metadata))

    command = '{base} push "{path}" "{sha1}"'.format(base=base_command, path=metadata.get('path', ''), sha1=sha1)
    results = netbox_host.run_sync(command)

    passive_netbox_hosts = remote.query(str(netbox_hosts.hosts - netbox_host.hosts))
    logger.info('Updating the Netbox passive copies of the repository on %s', passive_netbox_hosts)
    passive_netbox_hosts.run_sync('runuser -u {user} -- git -C "{path}" fetch {host} master:master'.format(
        path=NETBOX_BARE_REPO_PATH, user=NETBOX_USER, host=netbox_hostname))

    authdns_hosts = remote.query(AUTHDNS_HOSTS_QUERY)
    logger.info('Updating the authdns copies of the repository on %s', authdns_hosts)
    authdns_hosts.run_sync(
        'runuser -u {user} -- git -C "{path}" fetch && git -C "{path}" merge --ff-only {sha1}'.format(
            path=AUTHDNS_NETBOX_CHECKOUT_PATH, user=AUTHDNS_USER, sha1=sha1))

    if args.skip_authdns_update:
        logger.warning('ATTENTION! Skipping deploy of the updated zonefiles. The next manual authdns-update or '
                       'run of this cookbook will deploy the changes!')
    else:
        logger.info('Deploying the updated zonefiles on %s', authdns_hosts)
        authdns_hosts.run_sync('cd {git} && utils/deploy-check.py -g {netbox} --deploy'.format(
            git=AUTHDNS_DNS_CHECKOUT_PATH, netbox=AUTHDNS_NETBOX_CHECKOUT_PATH))
def test_node_class_instantiation():
    """Default values should be set when a Node instance is created."""
    node = clustershell.Node('name', [Command('command1'), Command('command2')])
    assert node.running_command_index == -1
    assert isinstance(node.state, State)