def run(self):
        """Reboot all Kafka nodes on a given cluster"""
        ask_confirmation(
            'Please check the Grafana dashboard of the cluster and make sure that '
            'topic partition leaders are well balanced and that all brokers are up and running.'
        )

        logger.info(
            'Checking that all Kafka brokers are reported up by their systemd unit status.'
        )
        self.kafka_brokers.run_sync('systemctl status kafka')

        logger.info('Checking if /etc/profile.d/kafka.sh can be sourced.')
        self.kafka_brokers.run_sync('source /etc/profile.d/kafka.sh')

        if self.sleep_before_pref_replica_election < 900:
            ask_confirmation(
                'The sleep time between a node restart and kafka preferred-replica-election '
                'is less than 900 seconds. The broker needs some time to recover after a restart. '
                'Are you sure?')

        for host in self.kafka_brokers.hosts:
            logger.info('Starting reboot of kafka host %s', host)
            self.reboot_kafka_node(host)

            logger.info('Sleeping %s seconds before next host',
                        self.batch_sleep_seconds)
            sleep(self.batch_sleep_seconds)

        logger.info('All Kafka node reboots completed!')
    def run(self):
        """Required by Spicerack API."""
        ask_confirmation(
            'If a config change is being rolled-out, please run puppet on all hosts '
            'before proceeding.')

        with self.icinga_hosts.downtimed(self.admin_reason,
                                         duration=timedelta(minutes=60)):
            logger.info("Depool and test on canary: %s", self.aqs_canary.hosts)
            self.aqs_canary.run_sync('depool', 'systemctl restart aqs')
            ask_confirmation('Please test aqs on the canary.')
            logger.info('Pool the canary back.')
            self.aqs_canary.run_sync('pool')

            aqs_lbconfig = self.remote.query_confctl(
                self.confctl,
                cluster=self.cluster,
                name=r'(?!' + self.aqs_canary.hosts[0] + ').*')

            logger.info('Restarting remaining daemons (one host at a time).')
            aqs_lbconfig.run('systemctl restart aqs',
                             svc_to_depool=['aqs'],
                             batch_size=1,
                             max_failed_batches=2,
                             batch_sleep=30.0)

        logger.info("All AQS service restarts completed!")
Example #3
    def _reboot(self, hosts: NodeSet) -> None:
        """Reboot a set of hosts with downtime

        Arguments:
            hosts (`NodeSet`): A list of hosts to reboot

        """
        puppet = self._spicerack.puppet(hosts)
        icinga_hosts = self._spicerack.icinga_hosts(hosts.hosts)
        try:
            duration = timedelta(minutes=20)
            with icinga_hosts.downtimed(self.reason, duration=duration):
                reboot_time = datetime.utcnow()
                confirm_on_failure(hosts.reboot, batch_size=len(hosts))
                hosts.wait_reboot_since(reboot_time, print_progress_bars=False)
                puppet.run(quiet=True)
                puppet.wait_since(reboot_time)
                icinga_hosts.wait_for_optimal()
            self.results.success(hosts.hosts)
        except IcingaError as error:
            ask_confirmation(f'Failed to downtime hosts: {error}')
            self.logger.warning(error)

        except AbortError as error:
            # Some host failed to come up again, or something fundamental broke.
            # log an error, continue *without* repooling
            self.logger.error(error)
            self.logger.error(
                'Error rebooting: Hosts %s, they may still be depooled', hosts)
            self.results.fail(hosts.hosts)
            raise
    def __init__(self, args, spicerack):
        """Initialize an Hadoop worker."""
        self.success_percent_cumin = args.success_percent / 100
        self.skip_disks = args.skip_disks
        self.disks_number = args.disks_number
        self.hostname_pattern = args.hostname_pattern
        self.partitions_basedir = args.partitions_basedir
        self.wipe_partitions = args.wipe_partitions
        self.hadoop_workers = spicerack.remote().query(self.hostname_pattern)

        letters = list(string.ascii_lowercase)
        if len(letters[self.skip_disks:]) < self.disks_number:
            raise RuntimeError(
                'The number of available letters is not enough to support {} disks, '
                'please check your parameters:\n{}'.format(
                    self.disks_number, letters[self.skip_disks:]))

        self.available_disk_labels = letters[
            self.skip_disks:self.disks_number + self.skip_disks]

        ask_confirmation(
            'Please check that the hosts to initialize are the expected ones: {}'
            .format(self.hadoop_workers.hosts))

        ask_confirmation(
            'Please check that the disk labels to act on are the expected '
            'ones: {}'.format(str(self.available_disk_labels)))

        ensure_shell_is_durable()
    def __init__(self, args, spicerack):
        """Change Hadoop distribution on all the clients of a given cluster"""
        if args.cluster == 'test':
            cumin_labels = HADOOP_TEST_CLIENT_CUMIN_ALIASES
        elif args.cluster == 'analytics':
            cumin_labels = HADOOP_CLIENT_CUMIN_ALIASES
        else:
            raise RuntimeError("Hadoop cluster {} not supported.".format(args.cluster))

        ensure_shell_is_durable()

        spicerack_remote = spicerack.remote()
        if args.cumin_client_label:
            if args.cumin_client_label not in cumin_labels:
                raise RuntimeError(
                    "Cumin label {} not supported. Please use one of: {}"
                    .format(args.cumin_client_label, cumin_labels))
            cumin_labels = [args.cumin_client_label]

        self.hadoop_client_hosts = spicerack_remote.query(' or '.join(cumin_labels))
        self.icinga_hosts = spicerack.icinga_hosts(self.hadoop_client_hosts.hosts)
        self.admin_reason = spicerack.admin_reason('Change Hadoop distribution')
        self.rollback = args.rollback
        self.cluster = args.cluster

        ask_confirmation(
            "This cookbook assumes that the Hadoop cluster runs already the new distro, "
            "please do not proceed otherwise.")
Example #6
def check_patterns_in_repo(host_paths, patterns):
    """Git grep for all the given patterns in the given hosts and path and ask for confirmation if any is found.

    Arguments:
        host_paths (sequence): a sequence of 2-item tuples with the RemoteHost instance and the path of the
            repositories to check.
        patterns (sequence): a sequence of patterns to check.

    """
    grep_command = "git -C '{{path}}' grep -E '({patterns})'".format(
        patterns='|'.join(patterns))
    ask = False
    for remote_host, path in host_paths:
        logger.info('Looking for matches in %s:%s', remote_host, path)
        for _nodeset, _output in remote_host.run_sync(
                Command(grep_command.format(path=path), ok_codes=[])):
            ask = True

    if ask:
        ask_confirmation(
            'Found match(es) in the Puppet or mediawiki-config repositories '
            '(see above), proceed anyway?')
    else:
        logger.info(
            'No matches found in the Puppet or mediawiki-config repositories')
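# A hypothetical usage sketch for check_patterns_in_repo(): the helper names
# (get_puppet_ca_hostname, DEPLOYMENT_HOST, get_grep_patterns) are taken from the
# decommission example further below (Example #14), while the two repository
# paths are illustrative placeholders only.
def example_check_decom_patterns(remote, dns, decom_hosts):
    """Grep the Puppet and mediawiki-config checkouts for references to the given hosts."""
    host_paths = (
        (remote.query(get_puppet_ca_hostname()), '/var/lib/git/operations/puppet'),    # placeholder path
        (remote.query(dns.resolve_cname(DEPLOYMENT_HOST)), '/srv/mediawiki-staging'),  # placeholder path
    )
    check_patterns_in_repo(host_paths, get_grep_patterns(dns, decom_hosts))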
Example #7
    def run(self):
        """Required by Spicerack API."""
        ask_confirmation(self.message)
        try:
            self.remote_hosts.run_sync(self.command)
        except RemoteExecutionError:
            logger.error('Some hosts were not reachable, see the list above. Typically caused by hardware maintenance.')
def test_ask_confirmation_abort(mocked_isatty, mocked_input):
    """Calling ask_confirmation() should raise AbortError if 'abort' is provided."""
    mocked_isatty.return_value = True
    mocked_input.return_value = 'abort'
    message = 'Test message'
    with pytest.raises(interactive.AbortError, match='Confirmation manually aborted'):
        interactive.ask_confirmation(message)
def test_ask_confirmation_go(mocked_isatty, mocked_input, capsys):
    """Calling ask_confirmation() should not raise if the correct answer is provided."""
    mocked_isatty.return_value = True
    mocked_input.return_value = 'go'
    message = 'Test message'
    interactive.ask_confirmation(message)
    out, _ = capsys.readouterr()
    assert message in out
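# A minimal sketch of where the mocked_isatty / mocked_input arguments used by the
# tests above could come from; the patch targets are assumptions (they presume that
# ask_confirmation() reads answers via the built-in input() and checks
# sys.stdout.isatty(), and that the module under test is spicerack.interactive).
from unittest import mock

from spicerack import interactive


@mock.patch('builtins.input')      # applied last, passed as the second extra argument (mocked_input)
@mock.patch('sys.stdout.isatty')   # applied first, passed as the first extra argument (mocked_isatty)
def test_ask_confirmation_example(mocked_isatty, mocked_input):
    """Hypothetical wiring for the confirmation tests shown above."""
    mocked_isatty.return_value = True
    mocked_input.return_value = 'go'
    interactive.ask_confirmation('Test message')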
    def run(self):
        """Restart all Hadoop jvm daemons on a given cluster"""
        with self.icinga_hosts.downtimed(self.admin_reason, duration=timedelta(minutes=120)):
            logger.info("Restarting Yarn Resourcemanager on Master.")
            self.hadoop_master.run_sync('systemctl restart hadoop-yarn-resourcemanager')
            logger.info("Sleeping %s seconds.", self.yarn_rm_sleep)
            time.sleep(self.yarn_rm_sleep)
            logger.info("Restarting Yarn Resourcemanager on Standby.")
            self.hadoop_standby.run_sync('systemctl restart hadoop-yarn-resourcemanager')

            print_hadoop_service_state(
                self.hadoop_master, self.hadoop_master_service, self.hadoop_standby_service, hdfs=False)

            ask_confirmation("Ok to proceed with HDFS Namenodes ?")

            logger.info("Run manual HDFS failover from master to standby.")
            run_hdfs_namenode_failover(self.hadoop_master, self.hadoop_master_service, self.hadoop_standby_service)

            logger.info("Sleeping 30 seconds.")
            time.sleep(30)

            logger.info("Restart HDFS Namenode on the master.")
            self.hadoop_master.run_async(
                'systemctl restart hadoop-hdfs-zkfc',
                'systemctl restart hadoop-hdfs-namenode')

            logger.info("Sleeping %s seconds.", self.hdfs_nn_sleep)
            time.sleep(self.hdfs_nn_sleep)

            print_hadoop_service_state(
                self.hadoop_master, self.hadoop_master_service, self.hadoop_standby_service, yarn=False)

            ask_confirmation("Ok to proceed?")

            logger.info("Run manual HDFS failover from standby to master.")
            run_hdfs_namenode_failover(self.hadoop_master, self.hadoop_standby_service, self.hadoop_master_service)

            logger.info("Sleeping 30 seconds.")
            time.sleep(30)

            logger.info("Restart HDFS Namenode on the standby.")
            self.hadoop_standby.run_async(
                'systemctl restart hadoop-hdfs-zkfc',
                'systemctl restart hadoop-hdfs-namenode')

            logger.info("Sleeping %s seconds.", self.hdfs_nn_sleep)
            time.sleep(self.hdfs_nn_sleep)

            logger.info("\n\nSummary of active/standby statuses after the restarts:")

            print_hadoop_service_state(
                self.hadoop_master, self.hadoop_master_service, self.hadoop_standby_service)

            logger.info("Restart MapReduce historyserver on the master.")
            self.hadoop_master.run_sync('systemctl restart hadoop-mapreduce-historyserver')
Example #11
    def __init__(self, args, spicerack):
        """Initialize the runner"""
        if args.cluster == 'test':
            self.cluster_cumin_alias = 'A:hadoop-worker-test'
            self.hdfs_jn_cumin_alias = 'A:hadoop-hdfs-journal-test'
        elif args.cluster == 'analytics':
            self.cluster_cumin_alias = 'A:hadoop-worker'
            self.hdfs_jn_cumin_alias = 'A:hadoop-hdfs-journal'
        else:
            raise RuntimeError("Hadoop cluster {} not supported.".format(
                args.cluster))

        ensure_shell_is_durable()

        self.cluster = args.cluster
        self.hadoop_workers = spicerack.remote().query(
            self.cluster_cumin_alias)
        self.hadoop_hdfs_journal_workers = spicerack.remote().query(
            self.hdfs_jn_cumin_alias)
        self.icinga_hosts = spicerack.icinga_hosts(self.hadoop_workers.hosts)
        self.admin_reason = spicerack.admin_reason(
            'Roll restart of jvm daemons for openjdk upgrade.')

        self.yarn_nm_batch_size = args.yarn_nm_batch_size
        self.yarn_nm_sleep = args.yarn_nm_sleep_seconds

        # Not configurable on purpose, too risky!
        self.hdfs_jn_batch_size = 1
        self.hdfs_jn_sleep = args.hdfs_jn_sleep_seconds

        self.hdfs_dn_batch_size = args.hdfs_dn_batch_size
        self.hdfs_dn_sleep = args.hdfs_dn_sleep_seconds

        # Safety checks
        if self.hdfs_dn_batch_size > 5:
            ask_confirmation(
                'The HDFS Datanode batch size is bigger than 5, are you sure?')
        if self.hdfs_dn_sleep < 20:
            ask_confirmation(
                'The HDFS Datanode sleep between each batch is less than 20s, are you sure?'
            )
        if self.hdfs_jn_sleep < 20:
            ask_confirmation(
                'The HDFS Journalnode sleep between each batch is less than 20s, are you sure?'
            )
        if self.yarn_nm_batch_size > 10:
            ask_confirmation(
                'The Yarn Nodemanager batch size is bigger than 10, are you sure?'
            )
        if self.yarn_nm_sleep < 20:
            ask_confirmation(
                'The Yarn Nodemanager sleep between each batch is less than 20s, are you sure?'
            )
Example #12
    def __init__(self, args, spicerack):
        """Change Hadoop distribution on a given cluster"""
        if args.cluster == 'test':
            suffix = '-test'
        elif args.cluster == 'analytics':
            suffix = ''
        else:
            raise RuntimeError("Hadoop cluster {} not supported.".format(args.cluster))

        ensure_shell_is_durable()

        spicerack_remote = spicerack.remote()

        self.hadoop_hosts = spicerack_remote.query(CLUSTER_CUMIN_ALIAS + suffix)
        self.hadoop_hdfs_journal_workers = spicerack_remote.query(HDFS_JOURNAL_CUMIN_ALIAS + suffix)
        if args.journalnodes_cumin_query:
            hadoop_hdfs_journal_override = spicerack_remote.query(args.journalnodes_cumin_query)
            self.hadoop_hdfs_journal_workers = spicerack_remote.query(
                "D{{{}}}".format(
                    self.hadoop_hdfs_journal_workers.hosts.intersection(hadoop_hdfs_journal_override.hosts)))
            ask_confirmation(
                'The cookbook will run only on the following journal hosts ({}), please verify that '
                'the list looks correct: {}'
                .format(len(self.hadoop_hdfs_journal_workers), self.hadoop_hdfs_journal_workers))

        self.hadoop_workers = spicerack_remote.query(WORKERS_CUMIN_ALIAS + suffix)
        if args.workers_cumin_query:
            hadoop_workers_override = spicerack_remote.query(args.workers_cumin_query)
            self.hadoop_workers = spicerack_remote.query(
                "D{{{}}}".format(self.hadoop_workers.hosts.intersection(hadoop_workers_override.hosts)))
            ask_confirmation(
                'The cookbook will run only on the following worker hosts ({}), please verify that '
                'the list looks correct: {}'
                .format(len(self.hadoop_workers), self.hadoop_workers))

        self.hadoop_master = spicerack_remote.query(MASTER_CUMIN_ALIAS + suffix)
        self.hadoop_standby = spicerack_remote.query(STANDBY_CUMIN_ALIAS + suffix)

        self.icinga_hosts = spicerack.icinga_hosts(self.hadoop_hosts.hosts)
        self.admin_reason = spicerack.admin_reason('Change Hadoop distribution')

        self.rollback = args.rollback
        self.cluster = args.cluster

        self.apt_install_options = '-y -o Dpkg::Options::="--force-confdef" -o Dpkg::Options::="--force-confold"'

        # Workaround needed for https://issues.apache.org/jira/browse/YARN-8310
        self.yarn_metadata_cleanup_commands = [
            f'setAcl /yarn-rmstore/analytics{suffix}-hadoop/ZKRMStateRoot world:anyone:cdrwa',
            f'rmr /yarn-rmstore/analytics{suffix}-hadoop/ZKRMStateRoot']
Example #13
    def _install_os(self):
        """Perform the OS reinstall."""
        pxe_reboot_time = datetime.utcnow()
        self.ipmi.force_pxe()
        self.host_actions.success('Forced PXE for next reboot')
        self.ipmi.reboot()
        self.host_actions.success('Host rebooted via IPMI')
        self.remote_installer.wait_reboot_since(pxe_reboot_time,
                                                print_progress_bars=False)
        time.sleep(30)  # Avoid race conditions, the host is in the d-i, need to wait anyway
        di_reboot_time = datetime.utcnow()
        env_command = 'grep -q "BOOT_IMAGE=debian-installer" /proc/cmdline'
        try:
            self.remote_installer.run_sync(env_command,
                                           print_output=False,
                                           print_progress_bars=False)
        except RemoteExecutionError:
            ask_confirmation(
                'Unable to verify that the host is inside the Debian installer, please verify manually '
                f'with: sudo install_console {self.fqdn}')

        self.host_actions.success('Host up (Debian installer)')
        self.remote_installer.wait_reboot_since(di_reboot_time,
                                                print_progress_bars=False)
        try:
            self.remote_installer.run_sync(f'! {env_command}',
                                           print_output=False,
                                           print_progress_bars=False)
        except RemoteExecutionError:
            ask_confirmation(
                'Unable to verify that the host rebooted into the new OS, it might still be into the '
                f'Debian installer, please verify manually with: sudo install_console {self.fqdn}'
            )

        result = self.remote_installer.run_sync('lsb_release -sc',
                                                print_output=False,
                                                print_progress_bars=False)
        for _, output in result:
            distro = output.message().decode()

        if distro != self.args.os:
            message = f'New OS is {distro} but {self.args.os} was requested'
            self.host_actions.failure(message)
            raise RuntimeError(message)

        self.host_actions.success(f'Host up (new fresh {distro} OS)')
Example #14
    def __init__(self, args, spicerack):
        """Decommission a host from all inventories."""
        ensure_shell_is_durable()
        self.remote = spicerack.remote()
        try:
            self.decom_hosts = self.remote.query(args.query).hosts
        except RemoteError:
            logger.debug("Query '%s' did not match any host or failed",
                         args.query,
                         exc_info=True)
            decom_hosts = NodeSet(args.query)
            ask_confirmation(
                'ATTENTION: the query does not match any host in PuppetDB or failed\n'
                'Hostname expansion matches {n} hosts: {hosts}\n'
                'Do you want to proceed anyway?'.format(n=len(decom_hosts),
                                                        hosts=decom_hosts))
            self.decom_hosts = decom_hosts

        if len(self.decom_hosts) > 20:
            raise RuntimeError(
                'Matched {} hosts, aborting. (max 20 with --force, 5 without)'.format(
                    len(self.decom_hosts)))

        if len(self.decom_hosts) > 5:
            if args.force:
                logger.info(
                    'Authorized decommissioning of %s hosts with --force',
                    len(self.decom_hosts))
            else:
                raise RuntimeError(
                    'Matched {} hosts and --force not set, aborting. (max 20 with --force, 5 without)'
                    .format(len(self.decom_hosts)))

        ask_confirmation(
            'ATTENTION: destructive action for {n} hosts: {hosts}\nAre you sure to proceed?'
            .format(n=len(self.decom_hosts), hosts=self.decom_hosts))

        self.spicerack = spicerack
        self.task_id = args.task_id
        self.puppet_master = self.remote.query(get_puppet_ca_hostname())
        self.kerberos_kadmin = self.remote.query(KERBEROS_KADMIN_CUMIN_ALIAS)
        self.dns = self.spicerack.dns()
        self.deployment_host = self.remote.query(
            self.dns.resolve_cname(DEPLOYMENT_HOST))
        self.patterns = get_grep_patterns(self.dns, self.decom_hosts)
        self.reason = self.spicerack.admin_reason('Host decommission',
                                                  task_id=self.task_id)
Example #15
def run(args, spicerack):
    """Required by Spicerack API."""
    post_process_args(args)
    if args.live_test:
        logger.info('Inverting DC to perform the warmup in %s (passive DC)',
                    args.dc_from)
        datacenter = args.dc_from
    else:
        datacenter = args.dc_to

    ask_confirmation(
        'Are you sure you want to warm up caches in {dc}?'.format(dc=datacenter))

    warmup_dir = '/var/lib/mediawiki-cache-warmup'
    # urls-cluster runs only against the appservers since it's for shared resources behind the
    # servers themselves
    warmups = [
        "nodejs {dir}/warmup.js {dir}/urls-cluster.txt spread appservers.svc.{dc}.wmnet"
        .format(dir=warmup_dir, dc=datacenter)
    ]
    for cluster in ["appserver", "api_appserver"]:
        # urls-server runs against both appserver and API clusters since it's for each individual server
        warmups.append(
            "nodejs {dir}/warmup.js {dir}/urls-server.txt clone {cluster} {dc}"
            .format(dir=warmup_dir, dc=datacenter, cluster=cluster))

    maintenance_host = spicerack.mediawiki().get_maintenance_host(datacenter)
    # It takes multiple executions of the warmup script to fully warm up the appserver caches. The second run is faster
    # than the first, and so on. Empirically, we consider the caches to be fully warmed up when this speedup disappears;
    # that is, when the execution time converges, and each attempt takes about as long as the one before.
    logger.info('Running warmup script in %s.', datacenter)
    logger.info('The script will re-run until execution time converges.')
    last_duration = datetime.timedelta.max
    for i in itertools.count(1):
        logger.info('Running warmup script, take %d', i)
        start_time = datetime.datetime.utcnow()
        maintenance_host.run_sync(*warmups)
        duration = datetime.datetime.utcnow() - start_time
        logger.info('Warmup completed in %s', duration)
        # After we've done a minimum number of iterations, we stop looping as soon as the warmup script takes more
        # than 95% as long as the previous run. That is, keep looping as long as it keeps going faster than before,
        # but with a 5% margin of error. At that point, any further reduction is probably just noise.
        if i >= MINIMUM_ITERATIONS and duration > 0.95 * last_duration:
            break
        last_duration = duration
    logger.info('Execution time converged, warmup complete.')
Example #16
    def __init__(self, args, spicerack):
        """Initialize the runner"""
        ensure_shell_is_durable()

        self.cluster_cumin_alias = "A:zookeeper-" + args.cluster
        self.zookeeper = spicerack.remote().query(self.cluster_cumin_alias)
        self.icinga_hosts = spicerack.icinga_hosts(self.zookeeper.hosts)
        self.admin_reason = spicerack.admin_reason(
            'Roll restart of jvm daemons.')
        self.batch_sleep_seconds = args.batch_sleep_seconds

        # Safety checks
        self.zookeeper.run_sync('echo stats | nc -q 1 localhost 2181')

        logger.info('\n=========================================\n')
        ask_confirmation(
            'Please check the status of Zookeeper before proceeding. '
            'There must be only one leader and the rest must be followers.')
Example #17
    def run(self):
        """Reboot all Hadoop workers of a given cluster"""
        if self.workers_cumin_query:
            hadoop_workers = self.spicerack_remote.query(self.cluster_cumin_alias)
            hadoop_workers_override = self.spicerack_remote.query(self.workers_cumin_query)
            hadoop_workers = self.spicerack_remote.query(
                "D{{{}}}".format(hadoop_workers.hosts.intersection(hadoop_workers_override.hosts)))
            ask_confirmation(
                'The user chose to limit the number of Hadoop workers to reboot. '
                'This option does not take Journal nodes into account and will simply reboot '
                'hosts in batches of the configured size ({}). This means that more than one Journal node '
                'may potentially be rebooted at the same time. Please check the list of hosts ({}) '
                'before proceeding: {}'.format(self.reboot_batch_size, len(hadoop_workers), hadoop_workers))

            worker_hostnames_n_slices = math.floor(len(hadoop_workers.hosts) / self.reboot_batch_size)
            logger.info('Rebooting Hadoop workers')
            for hadoop_workers_batch in hadoop_workers.split(worker_hostnames_n_slices):
                logger.info("Currently processing: %s", hadoop_workers_batch.hosts)
                self._reboot_hadoop_workers(hadoop_workers_batch)

        else:
            # The test cluster has few worker nodes, all running HDFS Datanodes
            # and Journalnodes, so we need a simpler procedure for this use case.
            if self.cluster != 'test':
                hadoop_workers_no_journal = self.spicerack_remote.query(
                    self.cluster_cumin_alias + ' and not ' + self.hdfs_jn_cumin_alias)

                # Split the workers into batches of hostnames
                worker_hostnames_n_slices = math.floor(len(hadoop_workers_no_journal.hosts) / self.reboot_batch_size)

                logger.info('Rebooting Hadoop workers NOT running a HDFS Journalnode')
                for hadoop_workers_batch in hadoop_workers_no_journal.split(worker_hostnames_n_slices):
                    logger.info("Currently processing: %s", hadoop_workers_batch.hosts)
                    self._reboot_hadoop_workers(hadoop_workers_batch)

            logger.info('Rebooting Hadoop workers running a HDFS Journalnode')
            # Using the following loop to iterate over every HDFS JournalNode
            # one at a time.
            hadoop_hdfs_journal_workers = self.spicerack_remote.query(self.hdfs_jn_cumin_alias)
            for hadoop_workers_batch in hadoop_hdfs_journal_workers.split(len(hadoop_hdfs_journal_workers.hosts)):
                logger.info("Currently processing: %s", hadoop_workers_batch.hosts)
                self._reboot_hadoop_workers(hadoop_workers_batch, stop_journal_daemons=True)

        logger.info('All reboots completed!')
Example #18
    def _restart_daemons(self, hosts: RemoteHosts) -> None:
        """Restart daemons on a set of hosts with downtime

        Arguments:
            hosts (`RemoteHosts`): A list of hosts to action

        """
        systemd_cmd = '/bin/systemctl'
        if self._args.ignore_restart_errors:
            # Only restart services which are active
            restart_cmds = [
                f'{systemd_cmd} --quiet is-active {daemon} && {systemd_cmd} restart {daemon} || /bin/true'
                for daemon in self.restart_daemons
            ]
        else:
            restart_cmds = [
                f"{systemd_cmd} restart {' '.join(self.restart_daemons)}"
            ]

        puppet = self._spicerack.puppet(hosts)
        icinga_hosts = self._spicerack.icinga_hosts(hosts.hosts)
        try:
            duration = timedelta(minutes=20)
            with icinga_hosts.downtimed(self.reason, duration=duration):
                now = datetime.utcnow()
                confirm_on_failure(hosts.run_sync, *restart_cmds)
                puppet.run(quiet=True)
                puppet.wait_since(now)
                icinga_hosts.wait_for_optimal()
            self.results.success(hosts.hosts)
        except IcingaError as error:
            ask_confirmation(f'Failed to downtime hosts: {error}')
            self.logger.warning(error)

        except AbortError as error:
            # Some host failed to come up again, or something fundamental broke.
            # log an error, exit *without* repooling
            self.logger.error(error)
            self.logger.error(
                'Error restarting daemons on: Hosts %s, they may still be depooled',
                hosts,
            )
            self.results.fail(hosts.hosts)
            raise
Example #19
    def __init__(self, args, spicerack):
        """Initialize the runner."""
        self.duration = timedelta(days=args.days,
                                  hours=args.hours,
                                  minutes=args.minutes)
        if args.force:
            self.hosts = NodeSet(args.query)
            ask_confirmation(
                f'Will downtime {len(self.hosts)} unverified hosts: {self.hosts}'
            )
        else:
            self.hosts = spicerack.remote().query(args.query).hosts
            if not self.hosts:
                raise RuntimeError(
                    f'No host found for query "{args.query}". Use --force to target Icinga hosts '
                    'that are not real hosts.')

        self.task_id = args.task_id
        self.icinga_hosts = spicerack.icinga_hosts(self.hosts,
                                                   verbatim_hosts=args.force)
        self.reason = spicerack.admin_reason(args.reason, task_id=args.task_id)

        if args.force_puppet:
            self.puppet = spicerack.puppet(spicerack.icinga_master_host)
        else:
            self.puppet = None

        if args.task_id is not None:
            self.phabricator = spicerack.phabricator(
                PHABRICATOR_BOT_CONFIG_FILE)
        else:
            self.phabricator = None

        if len(self.hosts) <= 5:
            hosts_message = str(self.hosts)
        else:
            hosts_message = f'{len(self.hosts)} hosts'

        self.short_message = f'for {self.duration} on {hosts_message} with reason: {args.reason}'

        self.long_message = (
            f'Icinga downtime set by {self.reason.owner} for {self.duration} on {len(self.hosts)} '
            f'host(s) and their services with reason: {args.reason}\n```\n{self.hosts}\n```'
        )
    def _update_local(self, working_dir: Path, message: str) -> None:
        """Update the repo with data from fetch_data.

        Arguments:
            working_dir (pathlib.Path): The temporary directory used to build diffs
            message (str): the commit message

        """
        repo_dir = working_dir / "repo"
        data_dir = working_dir / self._data_subdir
        working_repo = self._repo.clone(repo_dir)
        # Delete all existing files to ensure removal of stale data
        working_repo.git.rm("./", r=True, ignore_unmatch=True)
        # TODO: on python >= 3.8 we can use shutil.copytree with dirs_exist_ok=True
        copy_tree(str(data_dir), str(repo_dir), preserve_symlinks=1)
        self._commit(working_repo, message)
        print(working_repo.git.show(["--color=always", "HEAD"]))
        ask_confirmation(f"Ok to push changes to {self._repo.common_dir}")
        self._push(working_repo)
    def run(self):
        """Reboot all Druid nodes in a given cluster"""
        self.druid_workers.run_async(
            'systemctl --quiet is-active zookeeper && echo stats | nc localhost 2181 | grep Mode || exit 0'
        )

        ask_confirmation(
            'From the output of the last command, please check the status of the'
            ' Zookeeper cluster before proceeding.'
            ' There must be only one leader and the rest must be followers.')

        for host in self.druid_workers.hosts:
            logger.info('Start reboot of druid node %s', host)
            self.reboot_druid_node(host)
            logger.info(
                'Reboot completed for node %s. Waiting 10 minutes for daemons to catch up',
                host)
            sleep(600)

        logger.info('All Druid node reboots completed!')
    def __init__(self, args, spicerack):
        """Initialize the runner."""
        if args.force:
            self.hosts = NodeSet(args.query)
            ask_confirmation(
                f'Will remove downtime for {len(self.hosts)} unverified hosts: {self.hosts}'
            )
        else:
            self.hosts = spicerack.remote().query(args.query).hosts
            if not self.hosts:
                raise RuntimeError(
                    f'No host found for query "{args.query}". Use --force to target Icinga hosts '
                    'that are not real hosts.')

        self.icinga_hosts = spicerack.icinga_hosts(self.hosts,
                                                   verbatim_hosts=args.force)

        if len(self.hosts) <= 5:
            self.hosts_message = str(self.hosts)
        else:
            self.hosts_message = f'{len(self.hosts)} hosts'
Example #23
    def __init__(self, args, spicerack):
        """Initialize the runner."""
        ensure_shell_is_durable()
        self.cluster_cumin_alias = "A:kafka-" + args.cluster
        self.kafka_brokers = spicerack.remote().query(self.cluster_cumin_alias)
        self.icinga_hosts = spicerack.icinga_hosts(self.kafka_brokers.hosts)
        self.admin_reason = spicerack.admin_reason(
            'Roll restart of jvm daemons for openjdk upgrade.')
        self.batch_sleep_seconds = args.batch_sleep_seconds
        self.sleep_before_pref_replica_election = args.sleep_before_pref_replica_election

        ask_confirmation(
            'Please check the Grafana dashboard of the cluster and make sure that '
            'topic partition leaders are well balanced and that all brokers are up and running.'
        )

        if args.sleep_before_pref_replica_election < 900:
            ask_confirmation(
                'The sleep time between a broker restart and kafka preferred-replica-election '
                'is less than 900 seconds. The broker needs some time to recover after a restart. '
                'Are you sure?')
Example #24
    def __init__(self, args, spicerack):
        """Create a new Virtual Machine in Ganeti."""
        self.cluster, self.row, self.datacenter = get_locations()[args.location]
        self.hostname = args.hostname
        self.vcpus = args.vcpus
        self.memory = args.memory
        self.network = args.network
        self.disk = args.disk
        self.skip_v6 = args.skip_v6
        self.spicerack = spicerack
        self.netbox = self.spicerack.netbox(read_write=True)
        self.fqdn = make_fqdn(self.hostname, self.network, self.datacenter)
        self.allocated = []  # Store allocated IPs to rollback them on failure
        self.dns_propagated = False  # Whether to run the DNS cookbook on rollback
        self.need_netbox_sync = False  # Whether to sync the VM to Netbox on rollback

        print('Ready to create Ganeti VM {a.fqdn} in the {a.cluster} cluster on row {a.row} with {a.vcpus} vCPUs, '
              '{a.memory}GB of RAM, {a.disk}GB of disk in the {a.network} network.'.format(a=self))
        ask_confirmation('Is this correct?')

        ensure_shell_is_durable()
    def run(self):
        """Change the Hadoop distribution."""
        with self.icinga_hosts.downtimed(self.admin_reason, duration=timedelta(minutes=30)):
            if not self.rollback:
                logger.info(
                    'Saving a snapshot of cdh package names and versions in /root/cdh_package_list '
                    'on all nodes, and removing all packages.')
                confirm_on_failure(
                    self.hadoop_client_hosts.run_sync,
                    "dpkg -l | awk '/ii.*+cdh/ {print $2\" \"}' > /root/cdh_package_list")

            self._remove_packages()

            confirm_on_failure(self.hadoop_client_hosts.run_async, 'apt-get update')

            confirm_on_failure(
                self.hadoop_client_hosts.run_sync, 'apt-cache policy hadoop | grep Candidate')
            ask_confirmation('Please verify that the candidate hadoop package is correct across all nodes.')

            self._install_packages_on_clients()

            logger.info('The procedure is completed.')
    def _install_packages_on_workers(self):
        """Install Hadoop packages on Hadoop worker nodes."""
        logger.info("Install packages on worker nodes (long step).")

        if self.rollback:
            confirm_on_failure(
                self.hadoop_workers.run_sync,
                'apt-get install -y `cat /root/cdh_package_list`',
                batch_size=5, batch_sleep=60.0,
                success_threshold=HADOOP_WORKERS_CUMIN_SUCCESS_THRESHOLD)
        else:
            apt_package_filter = "|".join(CDH_PACKAGES_NOT_IN_BIGTOP)
            confirm_on_failure(
                self.hadoop_workers.run_sync,
                "apt-get install -y `cat /root/cdh_package_list | tr ' ' '\n' | "
                f"egrep -v '{apt_package_filter}' | tr '\n' ' '`",
                batch_size=5, batch_sleep=60.0,
                success_threshold=HADOOP_WORKERS_CUMIN_SUCCESS_THRESHOLD)

        # If the cookbook is running in rollback mode, then there are extra steps to be taken
        # for HDFS Datanodes.
        if self.rollback:
            logger.info('Stop each datanode and start it with the rollback option. Long step.')
            confirm_on_failure(
                self.hadoop_workers.run_async,
                'systemctl unmask hadoop-hdfs-datanode', 'service hadoop-hdfs-datanode rollback',
                batch_size=2, batch_sleep=30.0,
                success_threshold=HADOOP_WORKERS_CUMIN_SUCCESS_THRESHOLD)

        logger.info('Checking how many java daemons are running on the worker nodes '
                    'after installing the packages.')

        confirm_on_failure(
            self.hadoop_workers.run_sync,
            'ps aux | egrep "[j]ava.*(JournalNode|DataNode|NodeManager)" | wc -l',
            success_threshold=HADOOP_WORKERS_CUMIN_SUCCESS_THRESHOLD)
        ask_confirmation('Verify that the count is 2 for non-journal workers and 3 for journal workers.')
Example #27
    def run(self):
        """Add a new node to a Ganeti cluster."""
        print('Ready to add Ganeti node {} in the {} cluster'.format(
            self.fqdn, self.cluster))
        ask_confirmation('Is this correct?')

        if self.fqdn not in self.remote.query('A:ganeti-all').hosts:
            raise RuntimeError(
                '{} does not have the Ganeti role applied. Please fix and re-run the cookbook'
                .format(self.fqdn))

        self.validate_state(
            'ls /dev/kvm', 'does not have virtualisation enabled in BIOS')

        self.validate_state(
            'vgs | grep "ganeti "',
            ('No "ganeti" volume group found. You need to remove the swap device on /dev/md2, '
             'create a PV on /dev/md2 and eventually create a VG named "ganeti". Make sure to '
             'remove the stale swap entry from fstab as well'),
        )

        self.validate_state(
            'brctl show private | grep "en[o|p|s]"',
            'No private bridge configured',
        )

        self.validate_state(
            'brctl show public | grep "en[o|p|s]"',
            'No public bridge configured',
        )

        if self.fqdn in self.remote.query('A:eqiad').hosts:
            self.validate_state(
                'brctl show analytics | grep "en[o|p|s]"',
                'No analytics bridge configured',
            )

        self.master.run_sync(
            'gnt-node add --no-ssh-key-check -g "{group}" "{node}"'.format(
                group=self.group, node=self.fqdn))
        ask_confirmation('Has the node been added correctly?')

        self.master.run_sync('gnt-cluster verify')
        ask_confirmation('Verify that the cluster state looks correct.')

        self.master.run_sync('gnt-cluster verify-disks')
        ask_confirmation('Verify that the disk state looks correct.')
Example #28
    def __init__(self, args, spicerack):
        """Initialize the runner."""
        self.debmonitor = spicerack.debmonitor()
        self.removed_hosts = 0
        self.username = spicerack.username
        try:
            self.hosts = spicerack.remote().query(args.query).hosts
        except RemoteError:
            query_hosts = NodeSet(args.query)
            ask_confirmation(
                'Your query did not match any hosts. This can happen if the host\n'
                'record was already removed from PuppetDB, but persists in\n'
                'DebMonitor. Do you want to proceed? The following {l} hosts will be\n'
                'affected: {query_hosts}\n'.
                format(l=len(query_hosts), query_hosts=query_hosts))
            self.hosts = query_hosts

        if args.task_id is not None:
            self.phabricator = spicerack.phabricator(PHABRICATOR_BOT_CONFIG_FILE)
            self.task_id = args.task_id
        else:
            self.phabricator = None

        self.log_message = 'for {n} hosts: {hosts}'.format(n=len(self.hosts), hosts=self.hosts)
    def __init__(self, args, spicerack):
        """Initialize the runner."""
        if args.cluster == 'test':
            self.suffix = '-test'
            self.cluster = 'test'
        elif args.cluster == 'analytics':
            self.suffix = ''
            self.cluster = 'analytics'
        else:
            raise RuntimeError("Hadoop cluster {} not supported.".format(args.cluster))

        ensure_shell_is_durable()

        self.remote = spicerack.remote()
        self.hadoop_master = self.remote.query('A:hadoop-master' + self.suffix)
        self.hadoop_standby = self.remote.query('A:hadoop-standby' + self.suffix)
        self.icinga_hosts = spicerack.icinga_hosts(self.hadoop_master.hosts | self.hadoop_standby.hosts)
        self.admin_reason = spicerack.admin_reason('Restart of jvm daemons.')

        self.yarn_rm_sleep = args.yarn_rm_sleep_seconds
        self.hdfs_nn_sleep = args.hdfs_nn_sleep_seconds

        # Safety checks
        if self.hdfs_nn_sleep < 600:
            ask_confirmation('The HDFS Namenode restart sleep is less than 600s, are you sure?')
        if self.yarn_rm_sleep < 60:
            ask_confirmation('The Yarn Resourcemanager restart sleep is less than 60s, are you sure?')
        if len(self.hadoop_master) != 1:
            raise RuntimeError("Expecting exactly one Hadoop master server. Found: {}".format(self.hadoop_master))
        if len(self.hadoop_standby) != 1:
            raise RuntimeError("Expecting exactly one Hadoop standby server. Found: {}".format(self.hadoop_standby))

        # This is needed due to the format of the hostname in the command, for example:
        # sudo -u hdfs /usr/bin/hdfs haadmin -getServiceState an-master1001-eqiad-wmnet
        self.hadoop_master_service = self.hadoop_master.hosts[0].replace('.', '-')
        self.hadoop_standby_service = self.hadoop_standby.hosts[0].replace('.', '-')

        logger.info('Checking HDFS and Yarn daemon status. We expect active statuses on the Master node, '
                    'and standby statuses on the other. Please do not proceed otherwise.')

        print_hadoop_service_state(
            self.hadoop_master, self.hadoop_master_service, self.hadoop_standby_service)

        ask_confirmation('Please make sure that the active/standby nodes shown are correct.')
Example #30
    def __init__(self, args, spicerack):
        """Initiliaze the reimage runner."""
        ensure_shell_is_durable()
        self.args = args
        self.host = self.args.host

        self.netbox = spicerack.netbox()
        self.netbox_server = spicerack.netbox_server(self.host,
                                                     read_write=True)
        self.netbox_data = self.netbox_server.as_dict()

        ask_confirmation(
            f'ATTENTION: destructive action for host: {self.host}\nAre you sure to proceed?'
        )

        # Shortcut variables
        self.fqdn = self.netbox_server.fqdn
        self.mgmt_fqdn = self.netbox_server.mgmt_fqdn
        self.output_filename = self._get_output_filename(spicerack.username)
        self.actions = spicerack.actions
        self.host_actions = self.actions[self.host]
        self.confctl_services = []

        if self.netbox_server.virtual:
            raise RuntimeError(
                f'Host {self.host} is a virtual machine. VMs are not yet supported.'
            )

        self.dns = spicerack.dns()
        self.icinga_host = spicerack.icinga_hosts([self.host])
        self.ipmi = spicerack.ipmi(self.mgmt_fqdn)
        self.reason = spicerack.admin_reason('Host reimage',
                                             task_id=self.args.task_id)
        self.puppet_master = spicerack.puppet_master()
        self.debmonitor = spicerack.debmonitor()
        self.confctl = spicerack.confctl('node')
        self.remote = spicerack.remote()
        self.spicerack = spicerack

        try:
            self.remote_host = self.remote.query(self.fqdn)
            if self.args.new:
                ask_confirmation(
                    f'Host {self.fqdn} was found in PuppetDB but --new was set. Are you sure you want to '
                    'proceed? The --new option will be unset')
                self.args.new = False  # Unset --new
                logger.info('The option --new has been unset')
        except RemoteError as e:
            self.remote_host = self.remote.query(
                f'D{{{self.fqdn}}}')  # Use the Direct backend instead
            if not self.args.new:
                raise RuntimeError(
                    f'Host {self.fqdn} was not found in PuppetDB but --new was not set. Check that the '
                    'FQDN is correct. If the host is new or has disappeared from PuppetDB because it was '
                    'down for too long, use --new.') from e

        if len(self.remote_host) != 1:
            raise RuntimeError(
                f'Expected 1 host for query {self.fqdn} but got {len(self.remote_host)}: {self.remote_host}'
            )

        # The same as self.remote_host but using the SSH key valid only during installation before the first Puppet run
        self.remote_installer = spicerack.remote(installer=True).query(
            self.fqdn)
        # Get a Puppet instance for the current cumin host to update the known hosts file
        remote_localhost = self.remote.query(f'{self.reason.hostname}.*')
        if len(remote_localhost) != 1:
            raise RuntimeError(
                f'Localhost matched the wrong number of hosts ({len(remote_localhost)}) for '
                f'query "{self.reason.hostname}.*": {remote_localhost}')
        self.puppet_localhost = spicerack.puppet(remote_localhost)
        self.puppet = spicerack.puppet(self.remote_host)
        # The same as self.puppet but using the SSH key valid only during installation before the first Puppet run
        self.puppet_installer = spicerack.puppet(self.remote_installer)

        # DHCP automation
        try:
            self.dhcp_hosts = self.remote.query(
                f'A:installserver-light and A:{self.netbox_data["site"]["slug"]}'
            )
        except RemoteError:  # Fallback to eqiad's install server if the above fails, i.e. for a new DC
            self.dhcp_hosts = self.remote.query(
                'A:installserver-light and A:eqiad')
        self.dhcp = spicerack.dhcp(self.dhcp_hosts)
        self.dhcp_config = self._get_dhcp_config()

        self._validate()

        # Keep track of some specific actions for the eventual rollback
        self.rollback_masks = False
        self.rollback_depool = False

        if self.args.task_id is not None:
            self.phabricator = spicerack.phabricator(
                PHABRICATOR_BOT_CONFIG_FILE)
        else:
            self.phabricator = None