def __init__(self, args, spicerack): """Initialize the runner""" if args.cluster is not None: self.query = 'A:{}'.format(args.cluster) else: self.query = args.query ensure_shell_is_durable() self.cassandra_nodes = spicerack.remote().query(self.query) self.icinga_hosts = spicerack.icinga_hosts(self.cassandra_nodes.hosts) self.reason = spicerack.admin_reason(args.reason) self.instance_sleep_seconds = args.instance_sleep_seconds self.batch_sleep_seconds = args.batch_sleep_seconds logger.info( 'Checking that all Cassandra nodes are reported up by their systemd unit status.' ) # perhaps we should create a c-foreach-status script? # See also https://phabricator.wikimedia.org/T229916 status_cmd = """\ STRING=''; \ for i in $(c-ls) ; do STRING="${STRING} cassandra-${i}" ; done ; \ systemctl status $STRING\ """ self.cassandra_nodes.run_sync(status_cmd)
def __init__(self, args, spicerack): """Change Hadoop distribution on all the clients of a given cluster""" if args.cluster == 'test': cumin_labels = HADOOP_TEST_CLIENT_CUMIN_ALIASES elif args.cluster == 'analytics': cumin_labels = HADOOP_CLIENT_CUMIN_ALIASES else: raise RuntimeError("Hadoop cluster {} not supported.".format(args.cluster)) ensure_shell_is_durable() spicerack_remote = spicerack.remote() if args.cumin_client_label: if args.cumin_client_label not in cumin_labels: raise RuntimeError( "Cumin label {} not supported. Please use one of: {}" .format(args.cumin_client_label, cumin_labels)) cumin_labels = [args.cumin_client_label] self.hadoop_client_hosts = spicerack_remote.query(' or '.join(cumin_labels)) self.icinga_hosts = spicerack.icinga_hosts(self.hadoop_client_hosts.hosts) self.admin_reason = spicerack.admin_reason('Change Hadoop distribution') self.rollback = args.rollback self.cluster = args.cluster ask_confirmation( "This cookbook assumes that the Hadoop cluster runs already the new distro, " "please do not proceed otherwise.")
def __init__(self, args, spicerack): """Initialize an Hadoop worker.""" self.success_percent_cumin = args.success_percent / 100 self.skip_disks = args.skip_disks self.disks_number = args.disks_number self.hostname_pattern = args.hostname_pattern self.partitions_basedir = args.partitions_basedir self.wipe_partitions = args.wipe_partitions self.hadoop_workers = spicerack.remote().query(self.hostname_pattern) letters = list(string.ascii_lowercase) if len(letters[self.skip_disks:]) < self.disks_number: raise RuntimeError( 'The number of available letters is not enough to support {} disks, ' 'please check your parameters:\n{}'.format( self.disks_number, letters[self.skip_disks:])) self.available_disk_labels = letters[self. skip_disks:self.disks_number + self.skip_disks] ask_confirmation( 'Please check that the hosts to initialize are the expected ones: {}' .format(self.hadoop_workers.hosts)) ask_confirmation( 'Please check that the disk labels to act on are the expected ' 'ones: {}'.format(str(self.available_disk_labels))) ensure_shell_is_durable()
def test_ensure_shell_is_durable_interactive(mocked_isatty):
    """Should raise WmflibError if in an interactive shell."""
    mocked_isatty.return_value = True
    with pytest.raises(WmflibError, match='Must be run in non-interactive mode or inside a screen or tmux.'):
        interactive.ensure_shell_is_durable()
    assert mocked_isatty.called
def run(args, spicerack): """Required by Spicerack API.""" ensure_shell_is_durable() session = Session() session.verify = False return_code = 0 current_password = get_secret('Current password') new_password = get_secret("New password", confirm=True) session.auth = (args.username, current_password) _pdus = pdus.get_pdu_ips(spicerack.netbox(), args.query) for pdu in _pdus: try: if not spicerack.dry_run: change_password(pdu, session, new_password) else: logger.info('%s: Dry run, not trying.', pdu) if args.check_default: if pdus.check_default(pdu, session): # TODO: delete default user return_code = 1 except (pdus.VersionError, PasswordResetError) as error: logger.error(error) return_code = 1 return return_code
def __init__(self, args, spicerack): """Restart Presto on a given cluster.""" ensure_shell_is_durable() self.cluster = args.cluster self.presto_workers = spicerack.remote().query("A:presto-" + self.cluster) self.icinga_hosts = spicerack.icinga_hosts(self.presto_workers.hosts) self.admin_reason = spicerack.admin_reason('Roll restart of all Presto\'s jvm daemons.')
def __init__(self, args, spicerack): """Restart ORES daemons on a given cluster.""" cluster_cumin_alias = "A:ores-" + args.cluster self.cluster = args.cluster self.ores_workers = spicerack.remote().query(cluster_cumin_alias) self.icinga_hosts = spicerack.icinga_hosts(self.ores_workers.hosts) self.admin_reason = spicerack.admin_reason('Roll restart of ORES\'s daemons.') self.daemons = args.daemons self.spicerack = spicerack self.confctl = spicerack.confctl('node') ensure_shell_is_durable()
def __init__(self, args, spicerack): """Initialize the runner.""" ensure_shell_is_durable() self.cluster = args.cluster self.remote = spicerack.remote() self.confctl = spicerack.confctl('node') self.aqs_canary = self.remote.query('A:' + args.cluster + '-canary') self.aqs_workers = self.remote.query('A:' + args.cluster) self.icinga_hosts = spicerack.icinga_hosts(self.aqs_workers.hosts) self.admin_reason = spicerack.admin_reason( 'Roll restart of all AQS\'s nodejs daemons.')
def __init__(self, args, spicerack): """Initialize the runner""" if args.cluster == 'test': self.cluster_cumin_alias = 'A:hadoop-worker-test' self.hdfs_jn_cumin_alias = 'A:hadoop-hdfs-journal-test' elif args.cluster == 'analytics': self.cluster_cumin_alias = 'A:hadoop-worker' self.hdfs_jn_cumin_alias = 'A:hadoop-hdfs-journal' else: raise RuntimeError("Hadoop cluster {} not supported.".format( args.cluster)) ensure_shell_is_durable() self.cluster = args.cluster self.hadoop_workers = spicerack.remote().query( self.cluster_cumin_alias) self.hadoop_hdfs_journal_workers = spicerack.remote().query( self.hdfs_jn_cumin_alias) self.icinga_hosts = spicerack.icinga_hosts(self.hadoop_workers.hosts) self.admin_reason = spicerack.admin_reason( 'Roll restart of jvm daemons for openjdk upgrade.') self.yarn_nm_batch_size = args.yarn_nm_batch_size self.yarn_nm_sleep = args.yarn_nm_sleep_seconds # Not configurable on purpose, too risky! self.hdfs_jn_batch_size = 1 self.hdfs_jn_sleep = args.hdfs_jn_sleep_seconds self.hdfs_dn_batch_size = args.hdfs_dn_batch_size self.hdfs_dn_sleep = args.hdfs_dn_sleep_seconds # Safety checks if self.hdfs_dn_batch_size > 5: ask_confirmation( 'The HDFS Datanode batch size is bigger than 5, are you sure?') if self.hdfs_dn_sleep < 20: ask_confirmation( 'The HDFS Datanode sleep between each batch is less than 20s, are you sure?' ) if self.hdfs_jn_sleep < 20: ask_confirmation( 'The HDFS Journalnode sleep between each batch is less than 20s, are you sure?' ) if self.yarn_nm_batch_size > 10: ask_confirmation( 'The Yarn Nodemanager batch size is bigger than 10, are you sure?' ) if self.yarn_nm_sleep < 20: ask_confirmation( 'The Yarn Nodemanager sleep between each batch is less than 20s, are you sure?' )
def __init__(self, args, spicerack): """Restart druid daemons on a given cluster.""" cluster_cumin_alias = "A:druid-" + args.cluster self.need_depool = False if args.cluster == 'public': self.need_depool = True self.cluster = args.cluster self.druid_workers = spicerack.remote().query(cluster_cumin_alias) self.icinga_hosts = spicerack.icinga_hosts(self.druid_workers.hosts) self.admin_reason = spicerack.admin_reason( 'Roll restart of Druid jvm daemons.') self.daemons = args.daemons ensure_shell_is_durable()
def __init__(self, args, spicerack): """Reboot Presto on a given cluster.""" ensure_shell_is_durable() self.icinga_hosts = spicerack.icinga_hosts self.puppet = spicerack.puppet self.admin_reason = spicerack.admin_reason('Reboot Presto nodes') self.remote = spicerack.remote() self.cluster = args.cluster cluster_cumin_alias = 'A:presto-' + self.cluster self.presto_workers = self.remote.query(cluster_cumin_alias)
def __init__(self, args, spicerack): """Change Hadoop distribution on a given cluster""" if args.cluster == 'test': suffix = '-test' elif args.cluster == 'analytics': suffix = '' else: raise RuntimeError("Hadoop cluster {} not supported.".format(args.cluster)) ensure_shell_is_durable() spicerack_remote = spicerack.remote() self.hadoop_hosts = spicerack_remote.query(CLUSTER_CUMIN_ALIAS + suffix) self.hadoop_hdfs_journal_workers = spicerack_remote.query(HDFS_JOURNAL_CUMIN_ALIAS + suffix) if args.journalnodes_cumin_query: hadoop_hdfs_journal_override = spicerack_remote.query(args.journalnodes_cumin_query) self.hadoop_hdfs_journal_workers = spicerack_remote.query( "D{{{}}}".format( self.hadoop_hdfs_journal_workers.hosts.intersection(hadoop_hdfs_journal_override.hosts))) ask_confirmation( 'The cookbook will run only on the following journal hosts ({}), please verify that ' 'the list looks correct: {}' .format(len(self.hadoop_hdfs_journal_workers), self.hadoop_hdfs_journal_workers)) self.hadoop_workers = spicerack_remote.query(WORKERS_CUMIN_ALIAS + suffix) if args.workers_cumin_query: hadoop_workers_override = spicerack_remote.query(args.workers_cumin_query) self.hadoop_workers = spicerack_remote.query( "D{{{}}}".format(self.hadoop_workers.hosts.intersection(hadoop_workers_override.hosts))) ask_confirmation( 'The cookbook will run only on the following worker hosts ({}), please verify that ' 'the list looks correct: {}' .format(len(self.hadoop_workers), self.hadoop_workers)) self.hadoop_master = spicerack_remote.query(MASTER_CUMIN_ALIAS + suffix) self.hadoop_standby = spicerack_remote.query(STANDBY_CUMIN_ALIAS + suffix) self.icinga_hosts = spicerack.icinga_hosts(self.hadoop_hosts.hosts) self.admin_reason = spicerack.admin_reason('Change Hadoop distribution') self.rollback = args.rollback self.cluster = args.cluster self.apt_install_options = '-y -o Dpkg::Options::="--force-confdef" -o Dpkg::Options::="--force-confold"' # Workaround needed for https://issues.apache.org/jira/browse/YARN-8310 self.yarn_metadata_cleanup_commands = [ f'setAcl /yarn-rmstore/analytics{suffix}-hadoop/ZKRMStateRoot world:anyone:cdrwa', f'rmr /yarn-rmstore/analytics{suffix}-hadoop/ZKRMStateRoot']
def __init__(self, args, spicerack): """Reboot kafka on a given cluster.""" ensure_shell_is_durable() self.icinga_hosts = spicerack.icinga_hosts self.admin_reason = spicerack.admin_reason('Reboot kafka nodes') self.puppet = spicerack.puppet self.remote = spicerack.remote() self.sleep_before_pref_replica_election = args.sleep_before_pref_replica_election self.batch_sleep_seconds = args.batch_sleep_seconds self.cluster = args.cluster cluster_cumin_alias = "A:kafka-" + args.cluster self.kafka_brokers = self.remote.query(cluster_cumin_alias)
def __init__(self, args, spicerack): """Decommission a host from all inventories.""" ensure_shell_is_durable() self.remote = spicerack.remote() try: self.decom_hosts = self.remote.query(args.query).hosts except RemoteError: logger.debug("Query '%s' did not match any host or failed", args.query, exc_info=True) decom_hosts = NodeSet(args.query) ask_confirmation( 'ATTENTION: the query does not match any host in PuppetDB or failed\n' 'Hostname expansion matches {n} hosts: {hosts}\n' 'Do you want to proceed anyway?'.format(n=len(decom_hosts), hosts=decom_hosts)) self.decom_hosts = decom_hosts if len(self.decom_hosts) > 20: raise RuntimeError( 'Matched {} hosts, aborting. (max 20 with --force, 5 without)'. format(len(self.decom_hosts))) if len(self.decom_hosts) > 5: if args.force: logger.info( 'Authorized decommisioning of %s hosts with --force', len(self.decom_hosts)) else: raise RuntimeError( 'Matched {} hosts, and --force not set aborting. (max 20 with --force, 5 without)' .format(len(self.decom_hosts))) ask_confirmation( 'ATTENTION: destructive action for {n} hosts: {hosts}\nAre you sure to proceed?' .format(n=len(self.decom_hosts), hosts=self.decom_hosts)) self.spicerack = spicerack self.task_id = args.task_id self.puppet_master = self.remote.query(get_puppet_ca_hostname()) self.kerberos_kadmin = self.remote.query(KERBEROS_KADMIN_CUMIN_ALIAS) self.dns = self.spicerack.dns() self.deployment_host = self.remote.query( self.dns.resolve_cname(DEPLOYMENT_HOST)) self.patterns = get_grep_patterns(self.dns, self.decom_hosts) self.reason = self.spicerack.admin_reason('Host decommission', task_id=self.task_id)
def __init__(self, args, spicerack): """Reboot Druid on a given cluster.""" ensure_shell_is_durable() self.icinga_hosts = spicerack.icinga_hosts # Store the method to be called on each host self.puppet = spicerack.puppet self.spicerack = spicerack self.admin_reason = spicerack.admin_reason('Reboot Druid nodes') self.remote = spicerack.remote() self.cluster = args.cluster cluster_cumin_alias = 'A:druid-' + self.cluster self.druid_workers = self.remote.query(cluster_cumin_alias) self.need_depool = self.cluster == 'public'
def __init__(self, args, spicerack): """Upgrade MySQL on a given set of hosts.""" ensure_shell_is_durable() self.icinga_hosts = spicerack.icinga_hosts self.admin_reason = spicerack.admin_reason('MySQL upgrade') self.remote = spicerack.remote() query = 'P{' + args.query + '} and A:db-all and not A:db-multiinstance' self.hosts = spicerack.remote().query(query) self.puppet = spicerack.puppet self.logger = logging.getLogger(__name__) if not self.hosts: print('No hosts have been found, exiting') if len(self.hosts) <= 5: self.hosts_message = str(self.hosts) else: self.hosts_message = f'{len(self.hosts)} hosts'
def __init__(self, args, spicerack): """Initialize the runner""" ensure_shell_is_durable() self.cluster_cumin_alias = "A:zookeeper-" + args.cluster self.zookeeper = spicerack.remote().query(self.cluster_cumin_alias) self.icinga_hosts = spicerack.icinga_hosts(self.zookeeper.hosts) self.admin_reason = spicerack.admin_reason( 'Roll restart of jvm daemons.') self.batch_sleep_seconds = args.batch_sleep_seconds # Safety checks self.zookeeper.run_sync('echo stats | nc -q 1 localhost 2181') logger.info('\n=========================================\n') ask_confirmation( 'Please check the status of Zookeeper before proceeding.' 'There must be only one leader and the rest must be followers.')
def __init__(self, args, spicerack): """Add a new node to a Ganeti cluster.""" self.cluster, self.row, self.datacenter = get_locations()[ args.location] ganeti = spicerack.ganeti() self.remote = spicerack.remote() self.master = self.remote.query(ganeti.rapi(self.cluster).master) self.remote_host = self.remote.query(args.fqdn) self.fqdn = args.fqdn self.group = args.group ensure_shell_is_durable() if len(self.remote_host) == 0: raise RuntimeError('Specified server not found, bailing out') if len(self.remote_host) != 1: raise RuntimeError('Only a single server can be added at a time')
def __init__(self, args, spicerack): """Initialize the runner.""" if args.cluster == 'test': self.suffix = '-test' self.cluster = 'test' elif args.cluster == 'analytics': self.suffix = '' self.cluster = 'analytics' else: raise RuntimeError("Hadoop cluster {} not supported.".format(args.cluster)) ensure_shell_is_durable() self.remote = spicerack.remote() self.hadoop_master = self.remote.query('A:hadoop-master' + self.suffix) self.hadoop_standby = self.remote.query('A:hadoop-standby' + self.suffix) self.icinga_hosts = spicerack.icinga_hosts(self.hadoop_master.hosts | self.hadoop_standby.hosts) self.admin_reason = spicerack.admin_reason('Restart of jvm daemons.') self.yarn_rm_sleep = args.yarn_rm_sleep_seconds self.hdfs_nn_sleep = args.hdfs_nn_sleep_seconds # Safety checks if self.hdfs_nn_sleep < 600: ask_confirmation('The HDFS Namenode restart sleep is less than 600s, are you sure?') if self.yarn_rm_sleep < 60: ask_confirmation('The Yarn Resourcemanager restart sleep is less than 60s, are you sure?') if len(self.hadoop_master) != 1: raise RuntimeError("Expecting exactly one Hadoop master server. Found: {}".format(self.hadoop_master)) if len(self.hadoop_standby) != 1: raise RuntimeError("Expecting exactly one Hadoop standby server. Found: {}".format(self.hadoop_standby)) # This is needed due to the format of the hostname in the command, for example: # sudo -u hdfs /usr/bin/hdfs haadmin -getServiceState an-master1001-eqiad-wmnet self.hadoop_master_service = self.hadoop_master.hosts[0].replace('.', '-') self.hadoop_standby_service = self.hadoop_standby.hosts[0].replace('.', '-') logger.info('Checking HDFS and Yarn daemon status. We expect active statuses on the Master node, ' 'and standby statuses on the other. Please do not proceed otherwise.') print_hadoop_service_state( self.hadoop_master, self.hadoop_master_service, self.hadoop_standby_service) ask_confirmation('Please make sure that the active/standby nodes shown are correct.')
def __init__(self, args, spicerack): """Reboot all workers of a given Hadoop cluster.""" if args.cluster == 'test': self.cluster_cumin_alias = 'A:hadoop-worker-test' self.hdfs_jn_cumin_alias = 'A:hadoop-hdfs-journal-test' elif args.cluster == 'analytics': self.cluster_cumin_alias = 'A:hadoop-worker' self.hdfs_jn_cumin_alias = 'A:hadoop-hdfs-journal' else: raise RuntimeError("Hadoop cluster {} not supported.".format(args.cluster)) ensure_shell_is_durable() self.cluster = args.cluster self.spicerack_remote = spicerack.remote() self.spicerack = spicerack self.reboot_batch_size = args.batch_size self.yarn_nm_sleep_seconds = args.yarn_nm_sleep_seconds self.workers_cumin_query = args.workers_cumin_query self.reason = spicerack.admin_reason('Reboot.')
def __init__(self, args, spicerack): """Create a new Virtual Machine in Ganeti.""" self.cluster, self.row, self.datacenter = get_locations()[args.location] self.hostname = args.hostname self.vcpus = args.vcpus self.memory = args.memory self.network = args.network self.disk = args.disk self.skip_v6 = args.skip_v6 self.spicerack = spicerack self.netbox = self.spicerack.netbox(read_write=True) self.fqdn = make_fqdn(self.hostname, self.network, self.datacenter) self.allocated = [] # Store allocated IPs to rollback them on failure self.dns_propagated = False # Whether to run the DNS cookbook on rollback self.need_netbox_sync = False # Whether to sync the VM to Netbox on rollback print('Ready to create Ganeti VM {a.fqdn} in the {a.cluster} cluster on row {a.row} with {a.vcpus} vCPUs, ' '{a.memory}GB of RAM, {a.disk}GB of disk in the {a.network} network.'.format(a=self)) ask_confirmation('Is this correct?') ensure_shell_is_durable()
def __init__(self, args: Namespace, spicerack: Spicerack) -> None: """Initialize the runner.""" ensure_shell_is_durable() if args.alias and args.alias not in self.allowed_aliases: raise ValueError( f"Alias ({args.alias}) does not match allowed aliases: " + ', '.join(self.allowed_aliases)) self._args = args self.query = self._query() self.hosts = spicerack.remote().query(self.query) if not self.hosts: raise ValueError(f'Cumin query ({self.query}) matched zero hosts') self.number_of_batches = ceil(len(self.hosts.hosts) / args.batchsize) self.results = Results(action=args.action, hosts=self.hosts.hosts) reason = f'{args.action} {self.hosts.hosts}: {args.reason}' self.reason = spicerack.admin_reason(reason, args.task_id) self._spicerack = spicerack self.logger = getLogger('.'.join( (self.__module__, self.__class__.__name__)))
def __init__(self, args, spicerack): """Initialize the runner.""" ensure_shell_is_durable() self.cluster_cumin_alias = "A:kafka-" + args.cluster self.kafka_brokers = spicerack.remote().query(self.cluster_cumin_alias) self.icinga_hosts = spicerack.icinga_hosts(self.kafka_brokers.hosts) self.admin_reason = spicerack.admin_reason( 'Roll restart of jvm daemons for openjdk upgrade.') self.batch_sleep_seconds = args.batch_sleep_seconds self.sleep_before_pref_replica_election = args.sleep_before_pref_replica_election ask_confirmation( 'Please check the Grafana dashboard of the cluster and make sure that ' 'topic partition leaders are well balanced and that all brokers are up and running.' ) if args.sleep_before_pref_replica_election < 900: ask_confirmation( 'The sleep time between a broker restart and kafka preferred-replica-election ' 'is less than 900 seconds. The broker needs some time to recover after a restart. ' 'Are you sure?')
def run(args, spicerack): """Required by Spicerack API.""" if spicerack.dry_run: logger.info('this cookbook does nothing with with --dry-run') return 0 ensure_shell_is_durable() session = Session() session.verify = False return_code = 0 current_password = get_secret('Current password') session.auth = (args.username, current_password) _pdus = pdus.get_pdu_ips(spicerack.netbox(), args.query) for pdu in _pdus: try: if args.since: uptime = pdus.parse_uptime(pdus.get_uptime(pdu, session)) if uptime < args.since: logger.info('%s: Not rebooting uptime is %d', pdu, uptime) continue reboot_time = datetime.utcnow() version = pdus.get_version(pdu, session) pdus.reboot(pdu, version, session) # Reboots from expereince take at least 60 seconds logger.info('%s: sleep while reboot', pdu) sleep(60) pdus.wait_reboot_since(pdu, reboot_time, session) except (pdus.VersionError, pdus.RebootError, pdus.UptimeError) as error: logger.error(error) return_code = 1 if args.check_default: if pdus.check_default(pdu, session): # TODO: delete default user return_code = 1 return return_code
def run(args, spicerack): """Required by Spicerack API.""" ensure_shell_is_durable() return_code = 0 session = Session() session.verify = False password = get_secret('Enter login password') snmp_ro = get_secret('New SNMP RO String', confirm=True) session.auth = (args.username, password) _pdus = pdus.get_pdu_ips(spicerack.netbox(), args.query) for pdu in _pdus: snmp_rw = random_string() if args.reset_rw else None try: if not spicerack.dry_run: version = pdus.get_version(pdu, session) if change_snmp(pdu, version, session, snmp_ro, snmp_rw, args.force): reboot_time = datetime.utcnow() pdus.reboot(pdu, version, session) # Reboots from experience take at least 60 seconds logger.info('%s: sleep while reboot', pdu) sleep(60) pdus.wait_reboot_since(pdu, reboot_time, session) else: logger.info('%s: Dry run, not trying.', pdu) if args.check_default: if pdus.check_default(pdu, session): # TODO: delete default user pass except (pdus.VersionError, SnmpResetError, pdus.RebootError) as error: logger.error(error) return_code = 1 return return_code
def __init__(self, args, spicerack): """Initiliaze the provision runner.""" ensure_shell_is_durable() self.args = args self.netbox = spicerack.netbox() self.netbox_server = spicerack.netbox_server(self.args.host) self.netbox_data = self.netbox_server.as_dict() self.fqdn = self.netbox_server.mgmt_fqdn self.ipmi = spicerack.ipmi(self.fqdn) self.remote = spicerack.remote() if self.netbox_server.virtual: raise RuntimeError( f'Host {self.args.host} is a virtual machine. VMs are not supported.' ) if self.netbox_data['device_type']['manufacturer']['slug'] != 'dell': vendor = self.netbox_data['device_type']['manufacturer']['name'] raise RuntimeError( f'Host {self.args.host} manufacturer is {vendor}. Only Dell is supported.' ) if self.netbox_server.status == 'active' and (not self.args.no_dhcp or not self.args.no_users): raise RuntimeError( f'Host {self.args.host} has active status in Netbox but --no-dhcp and --no-users were not set.' ) # DHCP automation try: self.dhcp_hosts = self.remote.query( f'A:installserver-light and A:{self.netbox_data["site"]["slug"]}' ) except RemoteError: # Fallback to eqiad's install server if the above fails, i.e. for a new DC self.dhcp_hosts = self.remote.query( 'A:installserver-light and A:eqiad') self.dhcp = spicerack.dhcp(self.dhcp_hosts) address = self.netbox.api.ipam.ip_addresses.get( dns_name=self.fqdn).address self.interface = ipaddress.ip_interface(address) self.dhcp_config = DHCPConfMgmt( datacenter=self.netbox_data['site']['slug'], serial=self.netbox_data['serial'], fqdn=self.fqdn, ipv4=self.interface.ip, ) if self.args.no_users: password = '' # nosec else: password = DELL_DEFAULT if self.netbox_server.status in ('active', 'staged'): self.reboot_policy = DellSCPRebootPolicy.GRACEFUL else: self.reboot_policy = DellSCPRebootPolicy.FORCED self.redfish = spicerack.redfish(self.fqdn, 'root', password) self.mgmt_password = spicerack.management_password # Testing that the management password is correct connecting to the current cumin host localhost = gethostname() netbox_localhost = spicerack.netbox_server(localhost) try: spicerack.redfish(netbox_localhost.mgmt_fqdn, 'root').check_connection() except RedfishError: raise RuntimeError( f'The management password provided seems incorrect, it does not work on {localhost}.' ) from None self.config_changes = { 'BIOS.Setup.1-1': { 'BootMode': 'Bios', 'CpuInterconnectBusLinkPower': 'Enabled', 'EnergyPerformanceBias': 'BalancedPerformance', 'InternalUsb': 'Off', 'PcieAspmL1': 'Enabled', 'ProcC1E': 'Enabled', 'ProcCStates': 'Enabled', 'ProcPwrPerf': 'OsDbpm', 'ProcVirtualization': 'Enabled' if self.args.enable_virtualization else 'Disabled', 'ProcX2Apic': 'Disabled', 'SerialComm': 'OnConRedirCom2', 'SerialPortAddress': 'Serial1Com1Serial2Com2', 'SysProfile': 'PerfPerWattOptimizedOs', 'UncoreFrequency': 'DynamicUFS', 'UsbPorts': 'OnlyBackPortsOn', }, 'iDRAC.Embedded.1': { 'IPMILan.1#Enable': 'Enabled', 'IPv4.1#DHCPEnable': 'Disabled', 'IPv4Static.1#Address': str(self.interface.ip), 'IPv4Static.1#DNS1': DNS_ADDRESS, 'IPv4Static.1#Gateway': str(next(self.interface.network.hosts())), 'IPv4Static.1#Netmask': str(self.interface.netmask), 'NICStatic.1#DNSDomainFromDHCP': 'Disabled', }, 'System.Embedded.1': { 'ServerPwr.1#PSRapidOn': 'Disabled', } } netbox_host = self.netbox.api.dcim.devices.get(name=self.args.host) self.multi_gigabit = False if 'gbase-' in netbox_host.primary_ip.assigned_object.type.value: logger.info( 'Detected multi-gigabit interface, will add specific settings.' 
) self.multi_gigabit = True ask_confirmation( f'Are you sure to proceed to apply BIOS/iDRAC settings {self.runtime_description}?' )
def test_ensure_shell_is_durable_non_interactive(mocked_isatty):
    """Should not raise if the shell is non-interactive."""
    mocked_isatty.return_value = False
    interactive.ensure_shell_is_durable()
    assert mocked_isatty.called
def run(args, spicerack):  # pylint: disable=too-many-return-statements
    """Required by Spicerack API."""
    ensure_shell_is_durable()

    logger.info('Get source image checksum')
    dns = spicerack.dns()
    image_server = dns.resolve_ptr(dns.resolve_ipv4('apt.wikimedia.org')[0])[0]
    remote = spicerack.remote()
    image_server = remote.query(image_server)
    cmd = "sha1sum /srv/junos/{} | cut -d' ' -f1".format(args.image)
    results = image_server.run_sync(cmd)
    for _, output in results:
        src_checksum = output.message().decode()
        break
    if len(src_checksum) != 40:
        logger.info(src_checksum)
        logger.error("Can't checksum, is the file there and readable?")
        return 1

    device = remote.query('D{' + args.fqdn + '}')
    if len(device.hosts) > 1:
        logger.error('Only 1 target device please.')
        return 1

    logger.info('Cleanup device storage')
    results = device.run_sync('request system storage cleanup no-confirm | display json')
    json_output = output_to_json(results)
    if not json_output:
        return 1
    if 'success' not in json_output['system-storage-cleanup-information'][0]:
        logger.info(json_output)
        logger.error('Command did not run successfully')
        return 1

    logger.info('Copy image to device')
    cmd = 'file copy "https://apt.wikimedia.org/junos/{}" /var/tmp/'.format(args.image)
    device.run_sync(cmd)

    logger.info('Compare checksums')
    cmd = 'file checksum sha1 /var/tmp/{} | display json'.format(args.image)
    results = device.run_sync(cmd)
    json_output = output_to_json(results)
    if not json_output:
        return 1
    try:
        dst_checksum = json_output['checksum-information'][0]['file-checksum'][0]['checksum'][0]['data']
    except KeyError:
        logger.info(json_output)
        logger.error("Can't generate destination side checksum, did the file copy go well?")
        return 1
    if src_checksum != dst_checksum:
        logger.error('Checksum mismatch, maybe a partial file transfer?')
        return 1

    logger.info('Save rescue config')
    results = device.run_sync('request system configuration rescue save | display json')
    json_output = output_to_json(results)
    if not json_output:
        return 1
    if 'success' not in json_output['rescue-management-results'][0]['routing-engine'][0]:
        logger.info(json_output)
        logger.error('Command did not run successfully.')
        return 1

    logger.info('Validate image')
    if 'vmhost' in args.image:
        logger.info('Introduced in Junos OS Release 18.4R1, good luck.')
    else:
        cmd = 'request system software validate /var/tmp/{}.tgz'.format(args.image)
        if not present_in_output(device.run_sync(cmd), 'Validation succeeded'):
            logger.error('Validation failed, try running it manually.')
            return 1

    logger.info('Ready for next cookbook')
    return 0
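# A hedged sketch of the `output_to_json` helper used above, assuming it
# decodes the first chunk of Cumin output and parses it as JSON, returning
# None (and logging) when parsing fails:
import json


def output_to_json(results):
    """Parse `| display json` output from run_sync(), or return None."""
    for _, output in results:
        try:
            return json.loads(output.message().decode())
        except ValueError:
            logger.error('Invalid JSON output: %s', output.message())
            return None
    return None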
def test_ensure_shell_is_durable_sty(mocked_isatty, env_name, env_value, monkeypatch):
    """Should not raise if in an interactive shell with STY set, TMUX set or a screen-like TERM."""
    mocked_isatty.return_value = True
    monkeypatch.setenv(env_name, env_value)
    interactive.ensure_shell_is_durable()
    assert mocked_isatty.called
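# A minimal sketch of the `mocked_isatty` fixture the tests above rely on; the
# real test suite may patch a different target, but conceptually it replaces
# sys.stdout.isatty() with a mock:
import sys
from unittest import mock

import pytest


@pytest.fixture
def mocked_isatty():
    """Yield a mock standing in for sys.stdout.isatty()."""
    with mock.patch.object(sys.stdout, 'isatty') as mocked:
        yield mocked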
def __init__(self, args, spicerack): """Initiliaze the reimage runner.""" ensure_shell_is_durable() self.args = args self.host = self.args.host self.netbox = spicerack.netbox() self.netbox_server = spicerack.netbox_server(self.host, read_write=True) self.netbox_data = self.netbox_server.as_dict() ask_confirmation( f'ATTENTION: destructive action for host: {self.host}\nAre you sure to proceed?' ) # Shortcut variables self.fqdn = self.netbox_server.fqdn self.mgmt_fqdn = self.netbox_server.mgmt_fqdn self.output_filename = self._get_output_filename(spicerack.username) self.actions = spicerack.actions self.host_actions = self.actions[self.host] self.confctl_services = [] if self.netbox_server.virtual: raise RuntimeError( f'Host {self.host} is a virtual machine. VMs are not yet supported.' ) self.dns = spicerack.dns() self.icinga_host = spicerack.icinga_hosts([self.host]) self.ipmi = spicerack.ipmi(self.mgmt_fqdn) self.reason = spicerack.admin_reason('Host reimage', task_id=self.args.task_id) self.puppet_master = spicerack.puppet_master() self.debmonitor = spicerack.debmonitor() self.confctl = spicerack.confctl('node') self.remote = spicerack.remote() self.spicerack = spicerack try: self.remote_host = self.remote.query(self.fqdn) if self.args.new: ask_confirmation( f'Host {self.fqdn} was found in PuppetDB but --new was set. Are you sure you want to ' 'proceed? The --new option will be unset') self.args.new = False # Unset --new logger.info('The option --new has been unset') except RemoteError as e: self.remote_host = self.remote.query( f'D{{{self.fqdn}}}') # Use the Direct backend instead if not self.args.new: raise RuntimeError( f'Host {self.fqdn} was not found in PuppetDB but --new was not set. Check that the ' 'FQDN is correct. If the host is new or has disappeared from PuppetDB because down ' 'for too long use --new.') from e if len(self.remote_host) != 1: raise RuntimeError( f'Expected 1 host for query {self.fqdn} but got {len(self.remote_host)}: {self.remote_host}' ) # The same as self.remote_host but using the SSH key valid only during installation before the first Puppet run self.remote_installer = spicerack.remote(installer=True).query( self.fqdn) # Get a Puppet instance for the current cumin host to update the known hosts file remote_localhost = self.remote.query(f'{self.reason.hostname}.*') if len(remote_localhost) != 1: raise RuntimeError( f'Localhost matched the wrong number of hosts ({len(remote_localhost)}) for ' f'query "{self.reason.hostname}.*": {remote_localhost}') self.puppet_localhost = spicerack.puppet(remote_localhost) self.puppet = spicerack.puppet(self.remote_host) # The same as self.puppet but using the SSH key valid only during installation before the first Puppet run self.puppet_installer = spicerack.puppet(self.remote_installer) # DHCP automation try: self.dhcp_hosts = self.remote.query( f'A:installserver-light and A:{self.netbox_data["site"]["slug"]}' ) except RemoteError: # Fallback to eqiad's install server if the above fails, i.e. for a new DC self.dhcp_hosts = self.remote.query( 'A:installserver-light and A:eqiad') self.dhcp = spicerack.dhcp(self.dhcp_hosts) self.dhcp_config = self._get_dhcp_config() self._validate() # Keep track of some specific actions for the eventual rollback self.rollback_masks = False self.rollback_depool = False if self.args.task_id is not None: self.phabricator = spicerack.phabricator( PHABRICATOR_BOT_CONFIG_FILE) else: self.phabricator = None