def spawn_updater(self, node_id, init_commands, ray_start_commands,
                  node_resources, docker_config):
    """Start an updater thread for ``node_id`` and track it in ``self.updaters``.

    The updater is always created as a non-head node (``is_head_node=False``)
    that uses internal IPs (``use_internal_ip=True``). Provider, auth,
    cluster name, file mounts, synced files and rsync options are read from
    ``self.config``.

    Args:
        node_id: Identifier of the node the updater is created for.
        init_commands: Forwarded as ``setup_commands`` after being passed
            through ``with_head_node_ip`` together with ``self.head_node_ip``.
        ray_start_commands: Forwarded as ``ray_start_commands`` after the
            same ``with_head_node_ip`` treatment.
        node_resources: Forwarded unchanged to the updater.
        docker_config: Forwarded unchanged to the updater.
    """
    logger.info(
        f"Creating new (spawn_updater) updater thread for node {node_id}.")

    cfg = self.config
    # Entries are listed in the order the updater arguments are evaluated.
    updater_kwargs = {
        "node_id": node_id,
        "provider_config": cfg["provider"],
        "provider": self.provider,
        "auth_config": cfg["auth"],
        "cluster_name": cfg["cluster_name"],
        "file_mounts": cfg["file_mounts"],
        # Initialization commands are looked up per node, not passed in.
        "initialization_commands": with_head_node_ip(
            self._get_node_type_specific_fields(
                node_id, "initialization_commands"),
            self.head_node_ip),
        "setup_commands": with_head_node_ip(
            init_commands, self.head_node_ip),
        "ray_start_commands": with_head_node_ip(
            ray_start_commands, self.head_node_ip),
        "runtime_hash": self.runtime_hash,
        "file_mounts_contents_hash": self.file_mounts_contents_hash,
        "is_head_node": False,
        "cluster_synced_files": cfg["cluster_synced_files"],
        # Both rsync settings are optional in the config (None if absent).
        "rsync_options": {
            "rsync_exclude": cfg.get("rsync_exclude"),
            "rsync_filter": cfg.get("rsync_filter"),
        },
        "process_runner": self.process_runner,
        "use_internal_ip": True,
        "docker_config": docker_config,
        "node_resources": node_resources,
    }

    new_updater = NodeUpdaterThread(**updater_kwargs)
    new_updater.start()
    self.updaters[node_id] = new_updater
def recover_if_needed(self, node_id, now):
    """Restart Ray on ``node_id`` when its last heartbeat is too old.

    Does nothing when ``self.can_update(node_id)`` is falsy. Otherwise the
    node's internal IP is looked up in
    ``self.load_metrics.last_heartbeat_time_by_ip``; an IP with no recorded
    heartbeat is seeded with ``now``. If the time since the last heartbeat
    is below ``AUTOSCALER_HEARTBEAT_TIMEOUT_S`` nothing else happens.
    Otherwise a warning is logged and a new updater thread is started with
    empty file mounts and no initialization/setup commands, and stored in
    ``self.updaters[node_id]``.

    Args:
        node_id: Identifier of the node to check.
        now: Current time, in the same units as the stored heartbeat times.
    """
    if not self.can_update(node_id):
        return

    node_ip = self.provider.internal_ip(node_id)
    heartbeat_times = self.load_metrics.last_heartbeat_time_by_ip
    # First sighting of this IP: record `now` so the elapsed time is zero.
    elapsed = now - heartbeat_times.setdefault(node_ip, now)
    if elapsed < AUTOSCALER_HEARTBEAT_TIMEOUT_S:
        return

    warning_template = ("StandardAutoscaler: {}: No heartbeat in {}s, "
                        "restarting Ray to recover...")
    logger.warning(warning_template.format(node_id, elapsed))

    cfg = self.config
    # Entries are listed in the order the updater arguments are evaluated.
    updater_kwargs = {
        "node_id": node_id,
        "provider_config": cfg["provider"],
        "provider": self.provider,
        "auth_config": cfg["auth"],
        "cluster_name": cfg["cluster_name"],
        # Recovery only re-runs the Ray start commands: nothing is mounted
        # and no initialization or setup commands are given.
        "file_mounts": {},
        "initialization_commands": [],
        "setup_commands": [],
        "ray_start_commands": with_head_node_ip(
            cfg["worker_start_ray_commands"], self.head_node_ip),
        "runtime_hash": self.runtime_hash,
        "file_mounts_contents_hash": self.file_mounts_contents_hash,
        "process_runner": self.process_runner,
        "use_internal_ip": True,
        "is_head_node": False,
        "docker_config": cfg.get("docker"),
        "node_resources": self._node_resources(node_id),
    }

    recovery_updater = NodeUpdaterThread(**updater_kwargs)
    recovery_updater.start()
    self.updaters[node_id] = recovery_updater
def recover_if_needed(self, node_id, now):
    """Restart Ray on ``node_id`` when it has no recent heartbeat.

    Does nothing when ``self.can_update(node_id)`` is falsy or when
    ``self.heartbeat_on_time(node_id, now)`` is truthy. Otherwise a warning
    is logged, a restart event is added to ``self.event_summarizer``, and a
    new updater thread (created with ``for_recovery=True``, empty file
    mounts and no initialization/setup commands) is started and stored in
    ``self.updaters[node_id]``.

    Args:
        node_id: Identifier of the node to check.
        now: Current time, forwarded to ``self.heartbeat_on_time``.
    """
    # Short-circuit keeps the call order: can_update first, and the
    # heartbeat check only runs for updatable nodes.
    if not self.can_update(node_id) or self.heartbeat_on_time(node_id, now):
        return

    warning_template = ("StandardAutoscaler: {}: No recent heartbeat, "
                        "restarting Ray to recover...")
    logger.warning(warning_template.format(node_id))

    # The "{}" placeholder is left unformatted here; it is passed on to the
    # event summarizer as part of the template.
    node_type = self._get_node_type(node_id)
    event_template = ("Restarting {} nodes of type " + node_type +
                      " (lost contact with raylet).")
    self.event_summarizer.add(
        event_template, quantity=1, aggregate=operator.add)

    cfg = self.config
    # Entries are listed in the order the updater arguments are evaluated.
    updater_kwargs = {
        "node_id": node_id,
        "provider_config": cfg["provider"],
        "provider": self.provider,
        "auth_config": cfg["auth"],
        "cluster_name": cfg["cluster_name"],
        # Recovery only re-runs the Ray start commands: nothing is mounted
        # and no initialization or setup commands are given.
        "file_mounts": {},
        "initialization_commands": [],
        "setup_commands": [],
        "ray_start_commands": with_head_node_ip(
            cfg["worker_start_ray_commands"], self.head_node_ip),
        "runtime_hash": self.runtime_hash,
        "file_mounts_contents_hash": self.file_mounts_contents_hash,
        "process_runner": self.process_runner,
        "use_internal_ip": True,
        "is_head_node": False,
        "docker_config": cfg.get("docker"),
        "node_resources": self._node_resources(node_id),
        "for_recovery": True,
    }

    recovery_updater = NodeUpdaterThread(**updater_kwargs)
    recovery_updater.start()
    self.updaters[node_id] = recovery_updater