def _launch_block(self, external_block_id: str) -> Any:
    """Submit one block to the provider and return its internal job handle.

    Raises
    ------
    ScalingFailed
        If no launch command is configured, or the provider returns a
        falsy job handle.
    """
    if self.launch_cmd is None:
        raise ScalingFailed(self.provider.label, "No launch command")

    cmd = self.launch_cmd.format(block_id=external_block_id)
    job_handle = self.provider.submit(cmd, 1)
    logger.debug("Launched block {}->{}".format(external_block_id, job_handle))

    if not job_handle:
        raise ScalingFailed(self.provider.label,
                            "Attempts to provision nodes via provider has failed")
    return job_handle
def scale_out(self, blocks=1):
    """Scale out by launching the requested number of blocks.

    Parameters
    ----------
    blocks : int
        Number of blocks to be provisioned. Default is 1.

    Returns
    -------
    List of provider job handles for the launched blocks, or None when
    no execution provider is available.

    Raises
    ------
    ScalingFailed
        If the provider fails to return a job handle for a block.
    """
    # NOTE: the old docstring claimed this scales "by 1" and raises
    # NotImplementedError for threads; neither matched the code below.
    r = []
    for i in range(blocks):
        if self.provider:
            block = self.provider.submit(self.launch_cmd, 1, self.workers_per_node)
            logger.debug("Launched block {}:{}".format(i, block))
            if not block:
                raise ScalingFailed(
                    self.provider.label,
                    "Attempts to provision nodes via provider has failed")
            # Track the new block on the executor and in the result list.
            self.engines.extend([block])
            r.extend([block])
        else:
            logger.error("No execution provider available")
            r = None
    return r
def scale_out(self, blocks=1):
    """Scale out the number of active workers by the number of blocks specified.

    Parameters
    ----------
    blocks : int
        Number of blocks to scale out. Default=1.

    Returns
    -------
    List of provider job handles for the launched blocks, or None when
    no execution provider is available.

    Raises
    ------
    ScalingFailed
        If the provider fails to return a job handle for a block.
    """
    r = []
    for i in range(blocks):
        if self.provider:
            block = self.provider.submit(self.launch_cmd, self.workers_per_node)
            logger.debug("Launched block {}:{}".format(i, block))
            if not block:
                raise ScalingFailed(
                    self.provider.label,
                    "Attempts to provision nodes via provider has failed")
            self.blocks.extend([block])
            # fix: r was previously never populated, so callers always
            # received an empty list even on success (cf. sibling scale_out
            # implementations which extend r with each launched block).
            r.extend([block])
        else:
            logger.error("No execution provider available")
            r = None
    return r
def scale_out(self, blocks=1, task_type=None):
    """Scale out the number of blocks by "blocks".

    Parameters
    ----------
    blocks : int
        Number of blocks to provision. Default is 1.
    task_type : str, optional
        Worker type baked into the launch command and forwarded to the
        provider. When None, a generic block is launched ('RAW' worker
        type under hard scheduler mode).

    Returns
    -------
    Empty list on success, or None when no execution provider is
    available.

    Raises
    ------
    ScalingFailed
        If the provider fails to return an internal block handle.
    """
    r = []
    for i in range(blocks):
        if self.config.provider:
            self._block_counter += 1
            external_block_id = str(self._block_counter)
            # Untyped blocks under hard scheduler mode get the 'RAW' worker type.
            if not task_type and self.config.scheduler_mode == 'hard':
                launch_cmd = self.launch_cmd.format(block_id=external_block_id,
                                                    worker_type='RAW')
            else:
                launch_cmd = self.launch_cmd.format(block_id=external_block_id,
                                                    worker_type=task_type)
            if not task_type:
                internal_block = self.config.provider.submit(launch_cmd, 1)
            else:
                internal_block = self.config.provider.submit(launch_cmd, 1, task_type)
            logger.debug("Launched block {}->{}".format(external_block_id, internal_block))
            if not internal_block:
                # fix: the provider lives on self.config, not self — the old
                # self.provider.label would itself raise AttributeError here,
                # masking the ScalingFailed error.
                raise ScalingFailed(self.config.provider.label,
                                    "Attempts to provision nodes via provider has failed")
            self.blocks[external_block_id] = internal_block
            self.block_id_map[internal_block] = external_block_id
        else:
            logger.error("No execution provider available")
            r = None
    return r
def _submit_flux_jobs(
    submission_queue: queue.Queue,
    stop_event: threading.Event,
    socket: zmq.Socket,
    working_dir: str,
    flux_executor_kwargs: Mapping,
    provider: ExecutionProvider,
    executor: FluxExecutor,
    flux_path: str,
    launch_cmd: str,
):
    """Function to be run in a separate thread by executor.

    Pull ``_FluxJobInfo`` job packages from a queue and submit them to Flux.

    Sequence: launch a remote Flux instance via the provider, handshake
    with it over the ZMQ socket (package path, then instance URI), then
    drain ``submission_queue`` into a remote-connected
    ``flux.job.FluxExecutor`` until ``stop_event`` is set and the queue
    is empty, and finally tell the remote side to shut down.

    :param submission_queue: queue of ``_FluxJobInfo`` packages to submit.
    :param stop_event: set by the owner to request loop termination.
    :param socket: ZMQ socket used for the handshake with the remote manager.
    :param working_dir: directory used as the provider's script dir.
    :param flux_executor_kwargs: extra kwargs for ``flux.job.FluxExecutor``.
    :param provider: execution provider used to launch the Flux instance.
    :param executor: owning executor, reported in ``ScalingFailed``.
    :param flux_path: path to the ``flux`` binary on the remote side.
    :param launch_cmd: format string for the remote launch command.
    """
    provider.script_dir = working_dir  # type: ignore
    # Bind to a random local TCP port and bake the rendezvous details into
    # the launch command before submitting a single provider job.
    job_id = provider.submit(
        launch_cmd.format(
            port=socket.bind_to_random_port("tcp://*"),
            protocol="tcp",
            hostname=gethostname(),
            python=sys.executable,
            flux=flux_path,
            manager=_MANAGER_PATH,
        ),
        1,
    )
    if not job_id:
        raise ScalingFailed(
            executor,
            "Attempt to provision nodes via provider has failed",
        )
    # wait for the flux package path to be sent
    _check_provider_job(socket, provider, job_id)
    # receive path to the ``flux`` package from the ZMQ socket
    flux_pkg_path = socket.recv().decode()  # type: ignore
    # load the package. Unfortunately the only good way to do this is to
    # modify sys.path
    if flux_pkg_path not in sys.path:
        sys.path.append(flux_pkg_path)
    import flux.job
    socket.send(b"ack")  # dummy message
    # receive the URI of the Flux instance launched by provider
    _check_provider_job(socket, provider, job_id)
    flux_instance_uri = socket.recv()
    # create a ``flux.job.FluxExecutor`` connected to remote Flux instance
    with flux.job.FluxExecutor(handle_args=(flux_instance_uri, ), **flux_executor_kwargs) as flux_executor:
        # need to ensure that no jobs submitted after stop_event set
        # exit loop when event is set and queue is drained
        while not stop_event.is_set() or not submission_queue.empty():
            try:
                jobinfo = submission_queue.get(timeout=0.05)
            except queue.Empty:
                # short timeout keeps the loop responsive to stop_event
                pass
            else:
                _submit_single_job(flux_executor, working_dir, jobinfo)
    # signal the remote manager that no more jobs are coming
    socket.send(b"shutdown")
def _launch_block(self, block_id: str) -> Any:
    """Launch one block and return the provider job ID it was assigned.

    Raises
    ------
    ScalingFailed
        If the provider does not return a job ID.
    """
    cmd = self._get_launch_command(block_id)
    job_id = self.provider.submit(cmd, 1)
    if not job_id:
        raise ScalingFailed(
            self, "Attempt to provision nodes did not return a job ID")
    logger.debug(
        f"Launched block {block_id} on executor {self.label} with job ID {job_id}"
    )
    return job_id
def start(self):
    """Create the Interchange process and connect to it.

    Sets up the ZMQ task/result pipes, starts the queue-management
    thread and the local interchange process, then — if a provider is
    configured — fills in the launch command template and submits the
    provider's initial blocks.
    """
    # Outgoing tasks / incoming results pipes bound on loopback within the
    # configured port range.
    self.outgoing_q = zmq_pipes.TasksOutgoing("127.0.0.1", self.interchange_port_range)
    self.incoming_q = zmq_pipes.ResultsIncoming("127.0.0.1", self.interchange_port_range)

    self.is_alive = True

    self._queue_management_thread = None
    self._start_queue_management_thread()
    self._start_local_queue_process()

    logger.debug("Created management thread: {}".format(self._queue_management_thread))

    if self.provider:
        # debug_opts = "--debug" if self.worker_debug else ""
        # Bake the runtime endpoints and log directory into the launch
        # command template once; the formatted command replaces the template.
        l_cmd = self.launch_cmd.format(  # debug=debug_opts,
            task_url=self.worker_task_url,
            workers_per_node=self.workers_per_node,
            logdir="{}/{}".format(self.run_dir, self.label))
        self.launch_cmd = l_cmd
        logger.debug("Launch command: {}".format(self.launch_cmd))

        self._scaling_enabled = True
        logger.debug("Starting LowLatencyExecutor with provider:\n%s", self.provider)
        # NOTE(review): initial blocks are submitted here rather than via
        # scale_out; any submission failure aborts start() entirely.
        if hasattr(self.provider, 'init_blocks'):
            try:
                for i in range(self.provider.init_blocks):
                    block = self.provider.submit(self.launch_cmd, self.workers_per_node)
                    logger.debug("Launched block {}:{}".format(i, block))
                    if not block:
                        raise (ScalingFailed(
                            self.provider.label,
                            "Attempts to provision nodes via provider has failed"
                        ))
                    self.blocks.extend([block])
            except Exception as e:
                logger.error("Scaling out failed: {}".format(e))
                raise e
    else:
        # No provider: run without any scaling capability.
        self._scaling_enabled = False
        logger.debug("Starting LowLatencyExecutor with no provider")
def scale_out(self, blocks=1):
    """Launch ``blocks`` new blocks and return their external block ids.

    Each block gets a sequential external id (stringified current block
    count) which is mapped to the provider's internal job handle in
    ``self.blocks``.
    """
    launched = []
    for _ in range(blocks):
        external_block_id = str(len(self.blocks))
        launch_cmd = self.launch_cmd.format(block_id=external_block_id)
        internal_block = self.provider.submit(launch_cmd, 1)
        logger.debug("Launched block {}->{}".format(external_block_id, internal_block))
        if not internal_block:
            raise ScalingFailed(self.provider.label,
                                "Attempts to provision nodes via provider has failed")
        launched.append(external_block_id)
        self.blocks[external_block_id] = internal_block
    return launched
def scale_out(self, blocks=1):
    """Scale out by launching ``blocks`` new blocks via the provider.

    Each new block is registered in ``self.blocks``, keyed by a
    stringified sequential external block id. Scaling tasks can be slow;
    a future refactor could make this a coroutine operating on a
    resource object.
    """
    if not self.provider:
        logger.error("No execution provider available to scale")
        return
    for _ in range(blocks):
        external_block = str(len(self.blocks))
        internal_block = self.provider.submit(self.worker_command, 1)
        if not internal_block:
            # The provider failed to create the block.
            raise ScalingFailed(self.provider.label,
                                "Attempts to create nodes using the provider has failed")
        self.blocks[external_block] = internal_block
def scale_out(self, blocks=1):
    """Scale out the number of blocks by "blocks".

    Parameters
    ----------
    blocks : int
        Number of blocks to provision. Default is 1.

    Returns
    -------
    List of external block ids for the blocks that started successfully;
    failures are reported asynchronously via ``_fail_job_async`` rather
    than raised.

    Raises
    ------
    ScalingFailed
        When no execution provider is configured.
    """
    if not self.provider:
        # fix: ScalingFailed was called with a single argument here while
        # every other call site in this file uses the two-argument
        # (label/None, reason) form.
        raise ScalingFailed(None, "No execution provider available")
    r = []
    for i in range(blocks):
        external_block_id = str(len(self.blocks))
        try:
            self.blocks[external_block_id] = self._launch_block(external_block_id)
            r.append(external_block_id)
        except Exception as ex:
            self._fail_job_async(external_block_id,
                                 "Failed to start block {}: {}".format(external_block_id, ex))
    return r
def scale_out(self, blocks=1):
    """Scale out by ``blocks`` blocks, returning the new external block ids.

    Successfully started blocks are recorded in ``self.blocks``
    (external id -> job id) and ``self.block_mapping`` (job id ->
    external id). A block that fails to start is reported through
    ``_fail_job_async`` instead of raising.
    """
    if not self.provider:
        raise ScalingFailed(None, "No execution provider available")
    started = []
    for _ in range(blocks):
        block_id = str(len(self.blocks))
        try:
            job_id = self._launch_block(block_id)
            self.blocks[block_id] = job_id
            self.block_mapping[job_id] = block_id
            started.append(block_id)
        except Exception as ex:
            self._fail_job_async(
                block_id, "Failed to start block {}: {}".format(block_id, ex))
    return started
def scale_out(self, blocks: int = 1) -> List[str]:
    """Scale out by ``blocks`` blocks, returning the ids of those that started.

    Block ids are allocated from ``self._block_id_counter``. A block
    that fails to launch is reported via ``_fail_job_async`` rather than
    raising.
    """
    if not self.provider:
        raise ScalingFailed(self, "No execution provider available")
    started = []
    logger.info(f"Scaling out by {blocks} blocks")
    for _ in range(blocks):
        block_id = str(self._block_id_counter.get_id())
        logger.info(f"Allocated block ID {block_id}")
        try:
            job_id = self._launch_block(block_id)
            self.blocks[block_id] = job_id
            self.block_mapping[job_id] = block_id
            started.append(block_id)
        except Exception as ex:
            self._fail_job_async(
                block_id, "Failed to start block {}: {}".format(block_id, ex))
    return started
def scale_out(self, blocks=1):
    """Scale out the number of active blocks by ``blocks``.

    Parameters
    ----------
    blocks : int
        Number of blocks to provision. Default is 1.

    Returns
    -------
    List of provider job handles for the launched blocks, or None when
    no execution provider is available.

    Raises
    ------
    ScalingFailed
        If the provider fails to return a job handle for a block.
    """
    r = []
    for i in range(blocks):
        if self.provider:
            block = self.provider.submit(self.launch_cmd, 1, 1)
            log.debug(f"Launched block {i}:{block}")
            if not block:
                raise ScalingFailed(
                    self.provider.label,
                    "Attempts to provision nodes via provider has failed",
                )
            self.blocks.extend([block])
            # fix: r was previously never populated, so callers always
            # received an empty list even on success.
            r.extend([block])
        else:
            log.error("No execution provider available")
            r = None
    return r
def _get_launch_command(self, block_id: str) -> str: if self.launch_cmd is None: raise ScalingFailed(self, "No launch command") launch_cmd = self.launch_cmd.format(block_id=block_id) return launch_cmd