Exemplo n.º 1
0
 def spawn_updater(self, node_id, init_commands, ray_start_commands,
                   node_resources, docker_config):
     """Create, start and register a ``NodeUpdaterThread`` for a node.

     The updater is configured from ``self.config`` and the arguments,
     always with ``is_head_node=False`` and ``use_internal_ip=True``. Every
     command list is passed through ``with_head_node_ip`` together with
     ``self.head_node_ip`` before it is handed to the updater.

     Args:
         node_id: Identifier of the node to update; also the key under
             which the started updater is stored in ``self.updaters``.
         init_commands: Commands forwarded as the updater's
             ``setup_commands``.
         ray_start_commands: Commands forwarded as the updater's
             ``ray_start_commands``.
         node_resources: Forwarded unchanged as ``node_resources``.
         docker_config: Forwarded unchanged as ``docker_config``.
     """

     def _with_head_ip(commands):
         # self.head_node_ip is read at call time, once per command list.
         return with_head_node_ip(commands, self.head_node_ip)

     logger.info(
         f"Creating new (spawn_updater) updater thread for node {node_id}.")

     # Keys are listed in the order the constructor arguments are evaluated.
     updater_options = {
         "node_id": node_id,
         "provider_config": self.config["provider"],
         "provider": self.provider,
         "auth_config": self.config["auth"],
         "cluster_name": self.config["cluster_name"],
         "file_mounts": self.config["file_mounts"],
         "initialization_commands": _with_head_ip(
             self._get_node_type_specific_fields(
                 node_id, "initialization_commands")),
         "setup_commands": _with_head_ip(init_commands),
         "ray_start_commands": _with_head_ip(ray_start_commands),
         "runtime_hash": self.runtime_hash,
         "file_mounts_contents_hash": self.file_mounts_contents_hash,
         "is_head_node": False,
         "cluster_synced_files": self.config["cluster_synced_files"],
         # Optional settings: a missing key is forwarded as None.
         "rsync_options": {
             option: self.config.get(option)
             for option in ("rsync_exclude", "rsync_filter")
         },
         "process_runner": self.process_runner,
         "use_internal_ip": True,
         "docker_config": docker_config,
         "node_resources": node_resources,
     }

     node_updater = NodeUpdaterThread(**updater_options)
     node_updater.start()
     self.updaters[node_id] = node_updater
Exemplo n.º 2
0
 def recover_if_needed(self, node_id, now):
     """Restart Ray on a node whose last recorded heartbeat is too old.

     Does nothing when ``self.can_update(node_id)`` is false. Otherwise the
     node's internal IP is looked up in
     ``self.load_metrics.last_heartbeat_time_by_ip``; an IP without an
     entry is recorded with ``now`` as its heartbeat time. When the time
     elapsed since that heartbeat reaches ``AUTOSCALER_HEARTBEAT_TIMEOUT_S``
     a warning is logged and a ``NodeUpdaterThread`` is started with empty
     file mounts and no initialization or setup commands, then stored in
     ``self.updaters[node_id]``.

     Args:
         node_id: Identifier of the node to check.
         now: Current time; the stored heartbeat time is subtracted from it.
     """
     if not self.can_update(node_id):
         return

     node_ip = self.provider.internal_ip(node_id)
     heartbeat_times = self.load_metrics.last_heartbeat_time_by_ip
     # An IP seen for the first time is recorded as heartbeating right now.
     elapsed = now - heartbeat_times.setdefault(node_ip, now)
     if elapsed < AUTOSCALER_HEARTBEAT_TIMEOUT_S:
         return

     warning_template = ("StandardAutoscaler: "
                         "{}: No heartbeat in {}s, "
                         "restarting Ray to recover...")
     logger.warning(warning_template.format(node_id, elapsed))

     # Keys are listed in the order the constructor arguments are evaluated.
     recovery_options = {
         "node_id": node_id,
         "provider_config": self.config["provider"],
         "provider": self.provider,
         "auth_config": self.config["auth"],
         "cluster_name": self.config["cluster_name"],
         "file_mounts": {},
         "initialization_commands": [],
         "setup_commands": [],
         "ray_start_commands": with_head_node_ip(
             self.config["worker_start_ray_commands"], self.head_node_ip),
         "runtime_hash": self.runtime_hash,
         "file_mounts_contents_hash": self.file_mounts_contents_hash,
         "process_runner": self.process_runner,
         "use_internal_ip": True,
         "is_head_node": False,
         "docker_config": self.config.get("docker"),
         "node_resources": self._node_resources(node_id),
     }

     recovery_updater = NodeUpdaterThread(**recovery_options)
     recovery_updater.start()
     self.updaters[node_id] = recovery_updater
Exemplo n.º 3
0
    def recover_if_needed(self, node_id, now):
        """Restart Ray on a node that has no recent heartbeat.

        Returns without doing anything when ``self.can_update(node_id)`` is
        false or when ``self.heartbeat_on_time(node_id, now)`` is true.
        Otherwise a warning is logged, a restart event is added to
        ``self.event_summarizer``, and a ``NodeUpdaterThread`` created with
        ``for_recovery=True``, empty file mounts and no initialization or
        setup commands is started and stored in ``self.updaters[node_id]``.

        Args:
            node_id: Identifier of the node to check.
            now: Current time, forwarded to ``self.heartbeat_on_time``.
        """
        # Short-circuit keeps the original order: can_update is consulted
        # first and heartbeat_on_time only when an update is allowed.
        if not self.can_update(node_id) or self.heartbeat_on_time(
                node_id, now):
            return

        warning_template = ("StandardAutoscaler: "
                            "{}: No recent heartbeat, "
                            "restarting Ray to recover...")
        logger.warning(warning_template.format(node_id))

        node_type = self._get_node_type(node_id)
        # The "{}" placeholder is deliberately left unformatted here and is
        # passed to the summarizer as part of the template.
        event_template = ("Restarting {} nodes of type " + node_type +
                          " (lost contact with raylet).")
        self.event_summarizer.add(
            event_template, quantity=1, aggregate=operator.add)

        # Keys are listed in the order the constructor arguments are
        # evaluated.
        recovery_options = {
            "node_id": node_id,
            "provider_config": self.config["provider"],
            "provider": self.provider,
            "auth_config": self.config["auth"],
            "cluster_name": self.config["cluster_name"],
            "file_mounts": {},
            "initialization_commands": [],
            "setup_commands": [],
            "ray_start_commands": with_head_node_ip(
                self.config["worker_start_ray_commands"],
                self.head_node_ip),
            "runtime_hash": self.runtime_hash,
            "file_mounts_contents_hash": self.file_mounts_contents_hash,
            "process_runner": self.process_runner,
            "use_internal_ip": True,
            "is_head_node": False,
            "docker_config": self.config.get("docker"),
            "node_resources": self._node_resources(node_id),
            "for_recovery": True,
        }

        recovery_updater = NodeUpdaterThread(**recovery_options)
        recovery_updater.start()
        self.updaters[node_id] = recovery_updater