def joined(self):
    """Publish our unit name and raise availability/changed flags.

    Called when the relation is joined. Records this unit's name on the
    first relation, then toggles one ``available``/``changed`` flag pair
    per certificate category (CA, server certs, client certs).

    NOTE(review): each ``*_changed`` uses ``and`` short-circuiting, so
    ``data_changed`` is only called (and its cache only updated) when the
    corresponding data is actually available.
    """
    self.relations[0].to_publish_raw['unit_name'] = self._unit_name
    # All flags are namespaced by the endpoint name, e.g. "certificates.ca".
    prefix = self.expand_name('{endpoint_name}.')
    ca_available = self.root_ca_cert
    ca_changed = ca_available and data_changed(prefix + 'ca',
                                               self.root_ca_cert)
    server_available = self.server_certs
    server_changed = server_available and data_changed(
        prefix + 'servers', self.server_certs)
    client_available = self.client_certs
    client_changed = client_available and data_changed(
        prefix + 'clients', self.client_certs)
    # Aggregate convenience flags covering either cert category.
    certs_available = server_available or client_available
    certs_changed = server_changed or client_changed
    set_flag(prefix + 'available')
    toggle_flag(prefix + 'ca.available', ca_available)
    toggle_flag(prefix + 'ca.changed', ca_changed)
    toggle_flag(prefix + 'server.certs.available', server_available)
    toggle_flag(prefix + 'server.certs.changed', server_changed)
    toggle_flag(prefix + 'client.certs.available', client_available)
    toggle_flag(prefix + 'client.certs.changed', client_changed)
    toggle_flag(prefix + 'certs.available', certs_available)
    toggle_flag(prefix + 'certs.changed', certs_changed)
    # deprecated flag names kept for backwards compatibility
    toggle_flag(prefix + 'server.cert.available', self.server_certs)
    toggle_flag(prefix + 'client.cert.available', self.get_client_cert())
    toggle_flag(prefix + 'batch.cert.available', self.server_certs)
def join_or_update_cohorts():
    """Join or update a cohort snapshot.

    All units of this application (leader and followers) need to refresh
    their installed snaps to the current cohort snapshot.
    """
    kube_control = endpoint_from_flag("kube-control.cohort_keys.available")
    cohort_keys = kube_control.cohort_keys
    for snapname in cohort_snaps:
        hookenv.status_set("maintenance",
                           "Joining cohort for {}.".format(snapname))
        cohort_key = cohort_keys[snapname]
        # Retry with increasing back-off delays; `break` on success skips
        # the for/else clause below.
        for delay in (5, 30, 60):
            try:
                snap.join_cohort_snapshot(snapname, cohort_key)
                hookenv.log("Joined cohort for {}".format(snapname))
                break
            except subprocess.CalledProcessError:
                hookenv.log(
                    "Error joining cohort for {}".format(snapname),
                    level=hookenv.ERROR
                )
                hookenv.status_set(
                    "maintenance",
                    "Error joining cohort for {} (see logs), "
                    "will retry.".format(snapname),
                )
                # NOTE(review): this also sleeps after the final attempt,
                # before giving up below — possibly intentional pacing.
                time.sleep(delay)
        else:
            # All retries for this snap failed; abort without updating the
            # cohort-key cache so a later run will try again.
            set_flag("kubernetes-worker.cohorts.failed")
            return
    # Update our cache of the cohort keys, now that they're successfully
    # applied.
    data_changed("master-cohorts", cohort_keys)
    set_flag("kubernetes-worker.cohorts.joined")
    clear_flag("kubernetes-worker.cohorts.failed")
def watch_for_changes():
    """Watch for configuration changes and signal if we need to restart the
    worker services."""
    kube_control = endpoint_from_flag("kube-control.dns.available")
    container_runtime = endpoint_from_flag("endpoint.container-runtime.available")

    # Snapshot every watched input up front, in the same order as before.
    watched = (
        ("kube-api-servers", get_kube_api_servers()),
        ("kube-dns", kube_control.get_dns()),
        ("cluster-cidr", kubernetes_common.cluster_cidr()),
        ("container-runtime", container_runtime.get_runtime()),
        ("container-socket", container_runtime.get_socket()),
    )

    # Reflect runtime GPU support into the nvidia.ready state.
    if container_runtime.get_nvidia_enabled():
        set_state("nvidia.ready")
    else:
        remove_state("nvidia.ready")

    # any() over a generator short-circuits exactly like the original `or`
    # chain: once one key reports a change, later caches stay untouched.
    if any(data_changed(key, value) for key, value in watched):
        set_state("kubernetes-worker.restart-needed")
def set_cert(self, cert, key):
    """Publish a server certificate and key in response to this request.

    :param cert: TLS certificate (PEM text)
    :param key: TLS private key (PEM text)

    Also clears the per-endpoint "requested" flags once no new requests
    remain, and primes the SANs change-cache for this request.
    """
    rel = self._unit.relation
    if self._is_top_level_server_cert:
        # backwards compatibility; if this is the cert that was requested
        # as a single server cert, set it in the response as the single
        # server cert
        rel.to_publish_raw.update({
            self._server_cert_key: cert,
            self._server_key_key: key,
        })
    else:
        # Merge into the (JSON-encoded) map of certs keyed by common name,
        # then write the whole map back so the relation data is updated.
        data = rel.to_publish.get(self._publish_key, {})
        data[self.common_name] = {
            'cert': cert,
            'key': key,
        }
        rel.to_publish[self._publish_key] = data
    # Drop the "requested" flags once every outstanding request is served.
    if not rel.endpoint.new_server_requests:
        clear_flag(
            rel.endpoint.expand_name('{endpoint_name}.server'
                                     '.cert.requested'))
    if not rel.endpoint.new_requests:
        clear_flag(
            rel.endpoint.expand_name('{endpoint_name}.'
                                     'certs.requested'))
    # Record the current SANs (deduped, sorted) so later calls can detect
    # whether this request's SANs changed.
    data_changed(self._key, sorted(set(self.sans or [])))
def install_etcd_credentials():
    """Save etcd client credentials to disk and prime the change cache."""
    etcd = endpoint_from_flag('etcd.available')
    etcd.save_client_credentials(ETCD_KEY_PATH, ETCD_CERT_PATH, ETCD_CA_PATH)
    # register initial etcd data so that we can detect changes
    snapshot = (etcd.get_connection_string(), etcd.get_client_credentials())
    data_changed('calico.etcd.data', snapshot)
    set_state('calico.etcd-credentials.installed')
def handle_remote_config(self):
    """
    Keep track of received data so we can know if it changes.

    :return: None
    """
    clear_flag(self.expand_name('endpoint.{endpoint_name}.reconfigure'))
    # Presently, we only care about one piece of remote config. Expand
    # the list as needed.
    remote_config = [self.get_sandbox_image()]
    data_changed('containerd.remote_config', remote_config)
def update_status_info():
    """Send resource status and worker IPs to the deployer when they change."""
    endpoint = endpoint_from_flag('endpoint.kubernetes-deployer.available')
    status = check_predefined_resources()
    status.update(unitdata.kv().get('error-states', {}))
    worker_ips = get_worker_node_ips()
    # Only report if the status has changed. The `or` keeps the original
    # short-circuit: the worker-ips cache is untouched when status changed.
    status_moved = data_changed('status-info', status)
    if status_moved or data_changed('worker-ips', worker_ips):
        endpoint.send_status(status)
        endpoint.send_worker_ips(worker_ips)
def process_snapd_timer(): """ Set the snapd refresh timer on the leader so all cluster members (present and future) will refresh near the same time. :return: None """ # Get the current snapd refresh timer; we know layer-snap has set this # when the 'snap.refresh.set' flag is present. timer = snap.get(snapname="core", key="refresh.timer").decode("utf-8").strip() if not timer: # The core snap timer is empty. This likely means a subordinate timer # reset ours. Try to set it back to a previously leader-set value, # falling back to config if needed. Luckily, this should only happen # during subordinate install, so this should remain stable afterward. timer = leader_get("snapd_refresh") or hookenv.config("snapd_refresh") snap.set_refresh_timer(timer) # Ensure we have the timer known by snapd (it may differ from config). timer = snap.get(snapname="core", key="refresh.timer").decode("utf-8").strip() # The first time through, data_changed will be true. Subsequent calls # should only update leader data if something changed. if data_changed("snapd_refresh", timer): hookenv.log("setting leader snapd_refresh timer to: {}".format(timer)) leader_set({"snapd_refresh": timer})
def _handle_changed(self):
    """Raise connected/available flags, plus changed when the data moved."""
    set_flag(self.expand_name("{endpoint_name}.connected"))
    if not self.connection_string():
        return
    set_flag(self.expand_name("{endpoint_name}.available"))
    cache_key = self.expand_name("endpoint.{endpoint_name}.data")
    if data_changed(cache_key, self.connection_string()):
        set_flag(self.expand_name("{endpoint_name}.changed"))
def _get_secret_id(vault):
    """Return the AppRole secret ID for this unit, fetching a fresh one
    whenever the relation hands us a new one-shot token.

    :param vault: relation endpoint exposing ``unit_token`` and ``vault_url``
    :raises VaultNotReady: if Vault cannot be reached or is not usable yet
    """
    token = vault.unit_token
    if data_changed("layer.vault-kv.token", token):
        log("Changed unit_token, getting new secret_id")
        # token is one-shot, but if it changes it might mean that we're
        # being told to rotate the secret ID, or we might not have fetched
        # one yet
        vault_url = vault.vault_url
        try:
            secret_id = retrieve_secret_id(vault_url, token)
        except (
            requests.exceptions.ConnectionError,
            hvac.exceptions.VaultDown,
            hvac.exceptions.VaultNotInitialized,
            hvac.exceptions.BadGateway,
        ) as e:
            # Translate transient Vault/network failures into the layer's
            # retryable error, keeping the original as the cause.
            raise VaultNotReady() from e
        unitdata.kv().set("layer.vault-kv.secret_id", secret_id)
        # have to flush immediately because if we don't and hit some error
        # elsewhere, it could get us into a state where we have forgotten the
        # secret ID and can't retrieve it again because we've already used the
        # token
        unitdata.kv().flush()
    else:
        # Token unchanged: reuse the secret ID we stored on a previous call.
        secret_id = unitdata.kv().get("layer.vault-kv.secret_id")
    return secret_id
def check_config_changed():
    """Raise the config.changed flag when the vault-kv config moves."""
    try:
        config = vault_kv.get_vault_config()
    except vault_kv.VaultNotReady:
        # Vault isn't usable yet; nothing to compare.
        return
    if data_changed("layer.vault-kv.config", config):
        set_flag("layer.vault-kv.config.changed")
def manage_flags(self):
    """Reflect relation state into connected/gpu/requests flags."""
    joined = self.is_joined
    toggle_flag(self.expand_name('{endpoint_name}.connected'), joined)
    toggle_flag(self.expand_name('{endpoint_name}.gpu.available'),
                joined and self._get_gpu())
    cache_id = self.expand_name('{endpoint_name}.requests')
    if data_changed(cache_id, self.auth_user()):
        set_flag(self.expand_name('{endpoint_name}.requests.changed'))
def check_etcd_changes():
    """Re-save etcd credentials and force redeploys when etcd data changes."""
    etcd = endpoint_from_flag('etcd.available')
    current = (etcd.get_connection_string(), etcd.get_client_credentials())
    if data_changed('calico.etcd.data', current):
        etcd.save_client_credentials(ETCD_KEY_PATH, ETCD_CERT_PATH,
                                     ETCD_CA_PATH)
        remove_state('calico.service.installed')
        remove_state('calico.npc.deployed')
def _changed(self):
    """Maintain the master/standbys/database ``changed`` flags.

    Flags are cleared before being set so that any flag triggers fire even
    when the flag was already set.
    """
    # Set the master/standby changed flags. The charm is
    # responsible for clearing this, if it cares. Flags are
    # cleared before being set to ensure triggers are triggered.
    upgrade = hookenv.hook_name() == 'upgrade-charm'
    self._reset_all_flags()

    # On upgrade-charm, force the "changed" flags if data exists, since the
    # change-cache may have been carried over from the previous charm.
    key = self.expand_name('endpoint.{endpoint_name}.master.changed')
    if data_changed(key, [str(cs.master) for cs in self]) or (self.master and
                                                              upgrade):
        self._clear_flag('{endpoint_name}.master.changed')
        self._set_flag('{endpoint_name}.master.changed')
        self._clear_flag('{endpoint_name}.database.changed')
        self._set_flag('{endpoint_name}.database.changed')

    # Standbys are compared as sorted string lists so ordering differences
    # in the relation data don't register as a change.
    key = self.expand_name('endpoint.{endpoint_name}.standbys.changed')
    if data_changed(key, [sorted(str(s) for s in cs.standbys)
                          for cs in self]) or (self.standbys and upgrade):
        self._clear_flag('{endpoint_name}.standbys.changed')
        self._set_flag('{endpoint_name}.standbys.changed')
        self._clear_flag('{endpoint_name}.database.changed')
        self._set_flag('{endpoint_name}.database.changed')

    self._clear_flag('endpoint.{endpoint_name}.changed')
def catch_change_in_creds(kube_control):
    """Request a service restart in case credential updates were detected.

    :param kube_control: kube-control relation endpoint providing
        ``get_auth_credentials``.
    """
    # BUG FIX: the original format string ("******") contained no placeholder,
    # so the node name was silently dropped and the credential lookup used a
    # constant bogus user. Kubernetes node users follow the Node-authorizer
    # convention "system:node:<node-name>".
    nodeuser = "system:node:{}".format(get_node_name().lower())
    creds = kube_control.get_auth_credentials(nodeuser)
    if creds and creds["user"] == nodeuser:
        # We need to cache the credentials here because if the
        # control-plane changes (control-plane leader dies and replaced by a
        # new one) the new control-plane will have no recollection of our
        # certs.
        db.set("credentials", creds)
        set_state("worker.auth.bootstrapped")
        # Restart worker services only when the credentials actually moved.
        if data_changed("kube-control.creds", creds):
            set_state("kubernetes-worker.restart-needed")
def start_worker(): """Start kubelet using the provided API and DNS info.""" # Note that the DNS server doesn't necessarily exist at this point. We know # what its IP will eventually be, though, so we can go ahead and configure # kubelet with that info. This ensures that early pods are configured with # the correct DNS even though the server isn't ready yet. kube_control = endpoint_from_flag("kube-control.dns.available") servers = get_kube_api_servers() dns = kube_control.get_dns() dns_domain = dns["domain"] dns_ip = dns["sdn-ip"] registry = get_registry_location() cluster_cidr = kubernetes_common.cluster_cidr() if cluster_cidr is None: hookenv.log("Waiting for cluster cidr.") return if not servers: hookenv.log("Waiting for API server URL") return if kubernetes_common.is_ipv6(cluster_cidr): kubernetes_common.enable_ipv6_forwarding() creds = db.get("credentials") data_changed("kube-control.creds", creds) create_config(servers[get_unit_number() % len(servers)], creds) configure_default_cni(kube_control.get_default_cni()) configure_kubelet(dns_domain, dns_ip, registry, has_xcp=kube_control.has_xcp) configure_kube_proxy(configure_prefix, servers, cluster_cidr) set_state("kubernetes-worker.config.created") restart_unit_services() update_kubelet_status() set_state("kubernetes-worker.label-config-required") set_state("nrpe-external-master.reconfigure") remove_state("kubernetes-worker.restart-needed") remove_state("endpoint.kube-control.has-xcp.changed")
def set_cert(self, cert, key):
    """Send the cert and key to all units of the application

    :param cert: TLS Certificate
    :type cert: str
    :param key: TLS Private Key
    :type key: str
    """
    rel = self._unit.relation
    # Publish the shared application cert/key once per related unit, under
    # that unit's derived publish key.
    for unit in self._unit.relation.units:
        pub_key = self.derive_publish_key(unit=unit)
        data = rel.to_publish.get(pub_key, {})
        data['app_data'] = {
            'cert': cert,
            'key': key,
        }
        # Write the whole map back so the relation data is updated.
        rel.to_publish[pub_key] = data
    # Drop the "requested" flag once every application request is served.
    if not rel.endpoint.new_application_requests:
        clear_flag(
            rel.endpoint.expand_name(
                '{endpoint_name}.application.certs.requested'))
    # Record the current SANs (deduped, sorted) so later calls can detect
    # whether this request's SANs changed.
    data_changed(self._key, sorted(set(self.sans or [])))
def is_changed(self):
    """
    Whether or not the request for this instance has changed.
    """
    fields = [
        self.vsphere_ip,
        self.user,
        self.password,
        self.datacenter,
        self.datastore,
        self.folder,
        self.respool_path,
    ]
    return data_changed(self.expand_name('all-data'), fields)
def get_model():
    """Locate the model to serve and maintain the has-model flag and kv cache.

    Only the config-based lookup is currently active; the resource-based
    lookup below is disabled.
    """
    is_resource = True
    model = None
    # model = layer.kubeflow_tf_serving.get_model_from_resource()
    if not model:
        is_resource = False
        model = layer.kubeflow_tf_serving.get_model_from_config()
    if model:
        if data_changed('charm.kubeflow-tf-serving.model', model):
            set_flag('charm.kubeflow-tf-serving.has-model')
            clear_flag('charm.kubeflow-tf-serving.started')
            unitdata.kv().set('charm.kf-tf-serving.model', model)
            unitdata.kv().set('charm.kf-tf-serving.is-resource', is_resource)
    else:
        clear_flag('charm.kubeflow-tf-serving.has-model')
        # BUG FIX: the branch above stores under 'charm.kf-tf-serving.*', but
        # the original unset used 'charm.kubeflow-tf-serving.model', so the
        # cached model (and the is-resource marker) were never removed.
        unitdata.kv().unset('charm.kf-tf-serving.model')
        unitdata.kv().unset('charm.kf-tf-serving.is-resource')
def is_changed(self):
    """
    Whether or not the request for this instance has changed.
    """
    fields = [
        self.auth_url,
        self.region,
        self.username,
        self.password,
        self.user_domain_name,
        self.project_domain_name,
        self.project_name,
        self.endpoint_tls_ca,
        self.subnet_id,
        self.floating_network_id,
        self.lb_method,
        self.manage_security_groups,
    ]
    return data_changed(self.expand_name('all-data'), fields)
def process_snapd_timer(): ''' Set the snapd refresh timer on the leader so all cluster members (present and future) will refresh near the same time. ''' # Get the current snapd refresh timer; we know layer-snap has set this # when the 'snap.refresh.set' flag is present. timer = snap.get(snapname='core', key='refresh.timer').decode('utf-8') if not timer: # A subordinate wiped out our value, so we need to force it to be set # again. Luckily, the subordinate should only wipe it out once, on # first install, so this should remain stable afterward. snap.set_refresh_timer(hookenv.config('snapd_refresh')) timer = snap.get(snapname='core', key='refresh.timer').decode('utf-8') # The first time through, data_changed will be true. Subsequent calls # should only update leader data if something changed. if data_changed('snapd_refresh', timer): hookenv.log('setting leader snapd_refresh timer to: {}'.format(timer)) leadership.leader_set({'snapd_refresh': timer})
def _fetch():
    """Fetch every pending docker image resource and maintain its flags.

    For each queued resource name, read the attached resource file (YAML
    image info), cache it in unitdata, and toggle the per-resource
    ``available``/``failed``/``changed`` flags. Failed resources stay queued
    for a later retry.
    """
    should_set_status = layer.options.get('docker-resource', 'set-status')
    queue = unitdata.kv().get('layer.docker-resource.pending', [])
    failed = []
    for res_name in queue:
        prefix = 'layer.docker-resource.{}'.format(res_name)
        if should_set_status:
            layer.status.maintenance('fetching resource: {}'.format(res_name))
        try:
            image_info_filename = hookenv.resource_get(res_name)
            if not image_info_filename:
                # resource_get returned a falsy path; treat as fetch failure.
                raise ValueError('no filename returned')
            image_info = yaml.safe_load(Path(image_info_filename).read_text())
            if not image_info:
                raise ValueError('no data returned')
        except Exception as e:
            # Broad catch is deliberate: any failure marks the resource
            # failed and leaves it queued for retry rather than crashing.
            hookenv.log(
                'unable to fetch docker resource {}: {}'.format(res_name, e),
                level=hookenv.ERROR)
            failed.append(res_name)
            set_flag('{}.failed'.format(prefix))
            clear_flag('{}.available'.format(prefix))
            clear_flag('{}.changed'.format(prefix))
        else:
            unitdata.kv().set('{}.image-info'.format(prefix), image_info)
            # "changed" only fires when the resource was ALREADY available
            # and its data moved — not on first fetch.
            was_available = is_flag_set('{}.available'.format(prefix))
            is_changed = data_changed(prefix, image_info)
            set_flag('{}.available'.format(prefix))
            clear_flag('{}.failed'.format(prefix))
            toggle_flag('{}.changed'.format(prefix),
                        was_available and is_changed)
    if failed:
        if should_set_status:
            pl = 's' if len(failed) > 1 else ''
            layer.status.blocked(
                'unable to fetch resource{}: {}'.format(
                    pl, ', '.join(failed)
                )
            )
        # Requeue only the failures so the next run retries just those.
        unitdata.kv().set('layer.docker-resource.pending', failed)
        set_flag('layer.docker-resource.pending')
    else:
        unitdata.kv().set('layer.docker-resource.pending', [])
        clear_flag('layer.docker-resource.pending')
def nfs_state_control(mount):
    """Determine if we should remove the state that controls the re-render
    and execution of the nfs-relation-changed event because there are changes
    in the relationship data, and we should re-render any configs."""
    mount_data = get_first_mount(mount)
    if not mount_data:
        return
    nfs_relation_data = {
        "options": mount_data["options"],
        "host": mount_data["hostname"],
        "mountpoint": mount_data["mountpoint"],
        "fstype": mount_data["fstype"],
    }
    # Re-execute the rendering if the data has changed.
    if data_changed("nfs-config", nfs_relation_data):
        hookenv.log("reconfiguring nfs")
        remove_state("nfs.configured")
def update_registry_location():
    """Handle changes to the container image registry.

    Monitor the image registry location. If it changes, manage flags to
    ensure our image-related handlers will be invoked with an accurate
    registry.
    """
    registry_location = get_registry_location()
    if not registry_location:
        return

    runtime = endpoint_from_flag("endpoint.container-runtime.available")
    if runtime:
        # Construct and send the sandbox image (pause container) to our
        # runtime
        runtime.set_config(
            sandbox_image=get_sandbox_image_uri(registry_location))

    if data_changed("registry-location", registry_location):
        # Invalidate everything derived from the old registry and restart.
        for state in ("kubernetes-worker.config.created",
                      "kubernetes-worker.ingress.available",
                      "nfs.configured"):
            remove_state(state)
        set_state("kubernetes-worker.restart-needed")
def ensure_package_status():
    '''Hold or unhold packages per the package_status configuration option.

    All packages installed using this module and handlers are affected.

    A mechanism may be added in the future to override this for a subset of
    installed packages.
    '''
    packages = installed()
    if not packages:
        return
    package_status = hookenv.config().get('package_status') or ''
    snapshot = (package_status, sorted(packages))
    if reactive.data_changed('apt.package_status', snapshot):
        joined = ','.join(packages)
        if package_status == 'hold':
            hookenv.log('Holding packages {}'.format(joined))
            fetch.apt_hold(packages)
        else:
            hookenv.log('Unholding packages {}'.format(joined))
            fetch.apt_unhold(packages)
    reactive.clear_flag('apt.needs_hold')