예제 #1
0
def main():
    """Run the automated reimaging of a single host."""
    # Setup
    phab_client = None
    args = parse_args()
    lib.ensure_shell_mode()
    user = lib.get_running_user()
    log_path = setup_logging(user, args.host)
    cumin_output_path = log_path.replace('.log', '_cumin.out')
    if args.debug:
        logger.setLevel(logging.DEBUG)

    logger.info('wmf-auto-reimage-host called with args: {args}'.format(args=args))
    lib.print_line('REIMAGE START | To monitor the full log and cumin output:', host=args.host)
    lib.print_line('sudo tail -F {log}'.format(log=log_path), skip_time=True)
    lib.print_line('sudo tail -F {log}'.format(log=cumin_output_path), skip_time=True)

    try:
        lib.ensure_ipmi_password()
        lib.check_remote_ipmi(args.mgmt)
        if args.rename_mgmt:
            lib.check_remote_ipmi(args.rename_mgmt)

        if args.phab_task_id is not None:
            phab_client = lib.get_phabricator_client()
            lib.phabricator_task_update(
                phab_client, args.phab_task_id, lib.PHAB_COMMENT_PRE.format(
                    user=user, hostname=socket.getfqdn(), hosts=args.host, log=log_path))

        try:
            # This is needed due to a bug in tqdm and a limitation in Cumin
            with open(cumin_output_path, 'w', 1) as cumin_output:
                stderr = sys.stderr
                stdout = sys.stdout
                sys.stderr = cumin_output
                sys.stdout = cumin_output
                run(args, user, log_path)
                retcode = 0
        finally:
            sys.stderr = stderr
            sys.stdout = stdout
    except BaseException as e:
        message = 'Unable to run wmf-auto-reimage-host'
        lib.print_line('{message}: {error}'.format(message=message, error=e), host=args.host)
        logger.exception(message)
        retcode = 2
    finally:
        lib.print_line('REIMAGE END | retcode={ret}'.format(ret=retcode), host=args.host)

    # Comment on the Phabricator task
    if args.phab_task_id is not None and phab_client is not None:
        phabricator_message = lib.get_phabricator_post_message({retcode: [args.host]})
        lib.phabricator_task_update(phab_client, args.phab_task_id, phabricator_message)

    return retcode
예제 #2
0
def main():
    """Automated reimaging of a list of hosts."""
    # Setup
    args = parse_args()
    lib.ensure_shell_mode()
    user = lib.get_running_user()
    log_path = setup_logging(user)
    if args.debug:
        logger.setLevel(logging.DEBUG)

    logger.info('wmf-auto-reimage called with args: {args}'.format(args=args))
    lib.print_line('START. To monitor the full log:')
    lib.print_line('sudo tail -F {log}'.format(log=log_path), skip_time=True)

    try:
        retcode = run(args, user, log_path)
    except BaseException as e:
        message = 'Unable to run wmf-auto-reimage'
        lib.print_line('{message}: {error}'.format(message=message, error=e))
        logger.exception(message)
        retcode = 2
    finally:
        lib.print_line('END')

    return retcode
예제 #3
0
def main():
    """Downtime a single host on Icinga."""
    args = parse_args()
    user = lib.get_running_user()
    if args.debug:
        logger.setLevel(logging.DEBUG)

    if args.sleep:
        lib.print_line('Sleeping for {s} seconds'.format(s=args.sleep),
                       host=args.host)
        time.sleep(args.sleep)

    lib.print_line('Running Puppet on the Icinga server', host=args.host)
    try:
        if args.debug:
            lib.run_puppet([lib.resolve_dns(lib.ICINGA_DOMAIN, 'CNAME')],
                           no_raise=True)
            lib.icinga_downtime(args.host,
                                user,
                                args.phab_task_id,
                                title='wmf-downtime-host')
        else:
            # This is needed due to a bug in tqdm and a limitation in Cumin
            with open(os.devnull, 'w', 1) as cumin_output:
                stderr = sys.stderr
                stdout = sys.stdout
                sys.stderr = cumin_output
                sys.stdout = cumin_output
                lib.run_puppet([lib.resolve_dns(lib.ICINGA_DOMAIN, 'CNAME')],
                               no_raise=True)
                lib.icinga_downtime(args.host,
                                    user,
                                    args.phab_task_id,
                                    title='wmf-downtime-host')
        retcode = 0
    except BaseException as e:
        message = 'Unable to run wmf-downtime-host'
        lib.print_line('{message}: {error}'.format(message=message, error=e),
                       host=args.host)
        logger.exception(message)
        retcode = 2
    finally:
        if not args.debug:
            sys.stderr = stderr
            sys.stdout = stdout

    return retcode
예제 #4
0
def run(args, user, log_path):
    """Run the WMF auto reimage according to command line arguments.

    Arguments:
    args     -- parsed command line arguments
    user     -- the user that launched the script, for auditing purposes
    log_path -- the path of the logfile
    """
    previous = None  # Previous state in conftool
    rename_from = None  # In case of host rename, hold the previous hostname

    # Validate hosts have a signed Puppet certificate
    if not args.new:
        lib.validate_hosts([args.host], no_raise=args.no_verify)

    # Set Icinga downtime
    if not args.new and not args.no_downtime:
        lib.icinga_downtime(args.host, user, args.phab_task_id)

    # Depool via conftool
    if args.conftool and not args.new:
        previous = lib.conftool_depool(args.host, pooled=args.conftool_value)
        lib.print_line('Waiting 3 minutes to let the host drain',
                       host=args.host)
        time.sleep(180)

    if args.no_pxe:
        lib.print_line('Skipping PXE reboot', host=args.host)
        if (not lib.validate_hosts([args.host], no_raise=True)
                and lib.puppet_check_cert_to_sign(args.host) == 1):
            # There is no signed or pending signing certificate for the host
            lib.puppet_generate_cert(args.host)
    else:
        lib.puppet_remove_host(args.host)  # Cleanup Puppet

        # Reboot into PXE mode to start the reimage
        lib.set_pxe_boot(args.host, args.mgmt)
        status = lib.ipmitool_command(args.mgmt,
                                      ['chassis', 'power', 'status'])
        if status.startswith('Chassis Power is off'):
            lib.print_line('Current power status is off, powering on',
                           host=args.host)
            ipmi_command = ['chassis', 'power', 'on']
        else:
            lib.print_line('Power cycling', host=args.host)
            ipmi_command = ['chassis', 'power', 'cycle']

        lib.print_line(lib.ipmitool_command(args.mgmt,
                                            ipmi_command).rstrip('\n'),
                       host=args.host)

        # If the host is renamed, swap the hostnames now
        if args.rename is not None:
            rename_from = args.host
            args.host = args.rename
            args.mgmt = args.rename_mgmt

        # Wait that the host is booting into the installer using Cumin's direct backend
        lib.wait_reboot(args.host,
                        start=datetime.utcnow(),
                        installer_key=True,
                        debian_installer=True)
        # Wait for the reboot into the new system
        time.sleep(
            30
        )  # Avoid race conditions, the host is in the d-i, need to wait anyway
        lib.wait_reboot(args.host, start=datetime.utcnow(), installer_key=True)

        # Generate the Puppet certificate and signing request
        if lib.detect_init_system(args.host) == 'systemd':
            lib.puppet_generate_cert(args.host)

    # Sign the new Puppet certificate
    if lib.puppet_wait_cert_and_sign(args.host):
        if args.mask:  # Mask systemd services
            lib.mask_systemd_services(args.host, args.mask)

        lib.puppet_first_run(args.host)
        # Ensure the host is in Icinga
        lib.run_puppet([lib.resolve_dns(lib.ICINGA_DOMAIN, 'CNAME')],
                       no_raise=True)
        lib.icinga_downtime(args.host, user, args.phab_task_id)

    lib.check_bios_bootparams(args.host, args.mgmt)

    # Issue a reboot and wait for it and also for Puppet to complete
    if not args.no_reboot:
        reboot_time = datetime.utcnow()
        # Ensure the host is in the known hosts
        lib.run_puppet([socket.getfqdn()], no_raise=True)
        lib.reboot_host(args.host)
        boot_time = datetime.utcnow()
        lib.wait_reboot(args.host, start=reboot_time)
        lib.wait_puppet_run(args.host, start=boot_time)

    # Run Apache fast test
    if args.apache:
        lib.run_apache_fast_test(args.host)

    # The unmask is *not* done automatically, the commands to unmask are printed and logged
    if args.mask:
        lib.print_unmask_message(args.host, args.mask)

    # The repool is *not* done automatically, the command to repool is printed and logged
    if args.conftool:
        lib.print_repool_message(previous,
                                 rename_from=rename_from,
                                 rename_to=args.rename)

    lib.print_line('Reimage completed', host=args.host)
예제 #5
0
def run(args, user, log_path):
    """Run the WMF auto reimage according to command line arguments.

    Arguments:
    args     -- parsed command line arguments
    user     -- the user that launched the script, for auditing purposes
    log_path -- the path of the logfile
    """
    previous = None  # Previous state in conftool
    rename_from = None  # In case of host rename, hold the previous hostname
    cert_fingerprint = None

    # Validate hosts have a signed Puppet certificate
    if not args.new:
        lib.validate_hosts([args.host], no_raise=args.no_verify)

    # Set Icinga downtime
    if not args.new and not args.no_downtime:
        lib.icinga_downtime(args.host, user, args.phab_task_id)

    # Depool via conftool
    if args.conftool and not args.new:
        previous = lib.conftool_depool(args.host, pooled=args.conftool_value)
        lib.print_line('Waiting 3 minutes to let the host drain',
                       host=args.host)
        time.sleep(180)

    if args.no_pxe:
        lib.print_line('Skipping PXE reboot', host=args.host)
        if not lib.validate_hosts([args.host], no_raise=True):
            # There is no valid cert for the host, remove local cert and re-generate it
            lib.puppet_remove_local_cert(args.host, installer=True)
            ret = lib.puppet_check_cert_to_sign(args.host, lib.CERT_DESTROY)
            if ret != 1:
                raise RuntimeError((
                    'There was an error checking the certificate. puppet_check_cert_to_sign() '
                    'should have returned 1, got {ret} instead').format(
                        ret=ret))

            cert_fingerprint = lib.puppet_generate_cert(args.host)
    else:
        lib.puppet_remove_host(args.host)  # Cleanup Puppet
        lib.debmonitor_remove_host(args.host)

        # Reboot into PXE mode to start the reimage
        lib.set_pxe_boot(args.host, args.mgmt)
        status = lib.ipmitool_command(args.mgmt,
                                      ['chassis', 'power', 'status'])
        if status.startswith('Chassis Power is off'):
            lib.print_line('Current power status is off, powering on',
                           host=args.host)
            ipmi_command = ['chassis', 'power', 'on']
        else:
            lib.print_line('Power cycling', host=args.host)
            ipmi_command = ['chassis', 'power', 'cycle']

        lib.print_line(lib.ipmitool_command(args.mgmt,
                                            ipmi_command).rstrip('\n'),
                       host=args.host)

        # If the host is renamed, swap the hostnames now
        if args.rename is not None:
            rename_from = args.host
            args.host = args.rename
            args.mgmt = args.rename_mgmt

        # Wait that the host is booting into the installer using Cumin's direct backend
        lib.wait_reboot(args.host,
                        start=datetime.utcnow(),
                        installer_key=True,
                        debian_installer=True)
        # Wait for the reboot into the new system
        time.sleep(
            30
        )  # Avoid race conditions, the host is in the d-i, need to wait anyway
        lib.wait_reboot(args.host, start=datetime.utcnow(), installer_key=True)

        # Generate the Puppet certificate and signing request
        cert_fingerprint = lib.puppet_generate_cert(args.host)

    # Sign the new Puppet certificate
    if lib.puppet_wait_cert_and_sign(args.host, cert_fingerprint):
        if args.mask:  # Mask systemd services
            lib.mask_systemd_services(args.host, args.mask)

        # Downtime the host on Icinga with delay to give time to compile the Puppet catalog
        # and export its resources
        downtime_debug = ''
        if args.debug:
            downtime_debug = '-v '
        downtime_command = (
            '/bin/sleep 120 && /usr/bin/cookbook {verbose}sre.hosts.downtime '
            '--force-puppet -H 2 -r REIMAGE {host}').format(
                verbose=downtime_debug, host=args.host)

        downtime = subprocess.Popen(downtime_command, shell=True)
        lib.print_line('Scheduled delayed downtime on Icinga', host=args.host)

        lib.puppet_first_run(args.host)

        try:
            downtime.wait(timeout=180)
            downtime_success = (downtime.returncode == 0)
            downtime_message = 'returned {ret}'.format(ret=downtime.returncode)
        except subprocess.TimeoutExpired:
            downtime_success = False
            downtime_message = 'timed out'

        if not downtime_success:
            lib.print_line((
                'WARNING: failed to downtime host on Icinga, wmf-downtime-host '
                '{msg}').format(msg=downtime_message))

    lib.check_bios_bootparams(args.host, args.mgmt)

    # Issue a reboot and wait for it and also for Puppet to complete
    if not args.no_reboot:
        reboot_time = datetime.utcnow()
        # Ensure the host is in the known hosts
        lib.run_puppet([socket.getfqdn()], no_raise=True)
        lib.reboot_host(args.host)
        boot_time = datetime.utcnow()
        lib.wait_reboot(args.host, start=reboot_time)
        lib.wait_puppet_run(args.host, start=boot_time)

    # Run httpbb
    if args.apache:
        lib.run_httpbb(args.host)

    # The unmask is *not* done automatically, the commands to unmask are printed and logged
    if args.mask:
        lib.print_unmask_message(args.host, args.mask)

    # The repool is *not* done automatically, the command to repool is printed and logged
    if args.conftool:
        lib.print_repool_message(previous,
                                 rename_from=rename_from,
                                 rename_to=args.rename)

    lib.print_line('Reimage completed', host=args.host)
예제 #6
0
def run(args, user, log_path):
    """Run the reimage for all the hosts in subproceesses."""
    # Setup
    phab_client = lib.get_phabricator_client()
    lib.ensure_ipmi_password()
    mgmts = lib.get_mgmts(args.hosts)

    # Check that IPMI is working for all the hosts
    for host in args.hosts:
        lib.check_remote_ipmi(mgmts[host])

    # Initialize data structures
    procs = {}
    retcodes = defaultdict(list)

    # Validate hosts
    if not args.new:
        lib.validate_hosts(args.hosts, no_raise=args.no_verify)

    # Update the Phabricator task
    if args.phab_task_id is not None:
        lib.phabricator_task_update(
            phab_client, args.phab_task_id,
            lib.PHAB_COMMENT_PRE.format(user=user,
                                        hostname=socket.getfqdn(),
                                        hosts=args.hosts,
                                        log=log_path))

    # Run the reimage for each host in a child process
    try:
        for host in args.hosts:
            proc = reimage_host(host, mgmts[host], args)
            if args.sequential:
                ret = proc.wait()
                retcodes[ret].append(host)
                time.sleep(args.sleep)
            else:
                procs[host] = proc
                lib.print_line(
                    'Splaying the start of the next reimage by 2 minutes')
                time.sleep(120)

        if procs:
            retcodes = wait_for_childrens(procs)
    except KeyboardInterrupt:
        # Terminate childrens
        if procs:
            for process in procs:
                process.terminate()
        else:
            proc.terminate()

        raise

    # Comment on the Phabricator task
    if args.phab_task_id is not None:
        phabricator_message = lib.get_phabricator_post_message(retcodes)
        lib.phabricator_task_update(phab_client, args.phab_task_id,
                                    phabricator_message)

    if max(retcodes.keys()) > 0:
        return 1

    return 0