def spawn_updater(self, node_id, init_commands, ray_start_commands, node_resources, docker_config): logger.info(f"Creating new (spawn_updater) updater thread for node" f" {node_id}.") updater = NodeUpdaterThread( node_id=node_id, provider_config=self.config["provider"], provider=self.provider, auth_config=self.config["auth"], cluster_name=self.config["cluster_name"], file_mounts=self.config["file_mounts"], initialization_commands=with_head_node_ip( self._get_node_type_specific_fields( node_id, "initialization_commands")), setup_commands=with_head_node_ip(init_commands), ray_start_commands=with_head_node_ip(ray_start_commands), runtime_hash=self.runtime_hash, file_mounts_contents_hash=self.file_mounts_contents_hash, is_head_node=False, cluster_synced_files=self.config["cluster_synced_files"], process_runner=self.process_runner, use_internal_ip=True, docker_config=docker_config, node_resources=node_resources) updater.start() self.updaters[node_id] = updater
def recover_if_needed(self, node_id, now): if not self.can_update(node_id): return key = self.provider.internal_ip(node_id) if key not in self.load_metrics.last_heartbeat_time_by_ip: self.load_metrics.last_heartbeat_time_by_ip[key] = now last_heartbeat_time = self.load_metrics.last_heartbeat_time_by_ip[key] delta = now - last_heartbeat_time if delta < AUTOSCALER_HEARTBEAT_TIMEOUT_S: return logger.warning("StandardAutoscaler: " "{}: No heartbeat in {}s, " "restarting Ray to recover...".format(node_id, delta)) updater = NodeUpdaterThread( node_id=node_id, provider_config=self.config["provider"], provider=self.provider, auth_config=self.config["auth"], cluster_name=self.config["cluster_name"], file_mounts={}, initialization_commands=[], setup_commands=[], ray_start_commands=with_head_node_ip( self.config["worker_start_ray_commands"]), runtime_hash=self.runtime_hash, process_runner=self.process_runner, use_internal_ip=True, docker_config=self.config.get("docker")) updater.start() self.updaters[node_id] = updater
def spawn_updater(self, node_id, init_commands, ray_start_commands): updater = NodeUpdaterThread( node_id=node_id, provider_config=self.config["provider"], provider=self.provider, auth_config=self.config["auth"], cluster_name=self.config["cluster_name"], file_mounts=self.config["file_mounts"], initialization_commands=with_head_node_ip( self.config["initialization_commands"]), setup_commands=with_head_node_ip(init_commands), ray_start_commands=with_head_node_ip(ray_start_commands), runtime_hash=self.runtime_hash, process_runner=self.process_runner, use_internal_ip=True) updater.start() self.updaters[node_id] = updater