# Example 1

class Interchange(object):
    """ Interchange is a task orchestrator for distributed systems.

    1. Asynchronously queue large volume of tasks (>100K)
    2. Allow for workers to join and leave the union
    3. Detect workers that have failed using heartbeats
    4. Service single and batch requests from workers
    5. Be aware of each worker's resource capacity,
       e.g. schedule only jobs that fit into the available walltime.

    TODO: We most likely need a PUB channel to send out global commands, like shutdown
    """
    def __init__(self,
                 config,
                 client_address="127.0.0.1",
                 interchange_address="127.0.0.1",
                 client_ports=(50055, 50056, 50057),
                 worker_ports=None,
                 worker_port_range=(54000, 55000),
                 cores_per_worker=1.0,
                 worker_debug=False,
                 launch_cmd=None,
                 heartbeat_threshold=60,
                 logdir=".",
                 logging_level=logging.INFO,
                 poll_period=10,
                 endpoint_id=None,
                 suppress_failure=False,
                 max_heartbeats_missed=2
                 ):
        """
        Parameters
        ----------
        config : funcx.Config object
             Funcx config object that describes how compute should be provisioned

        client_address : str
             The ip address at which the parsl client can be reached. Default: "127.0.0.1"

        interchange_address : str
             The ip address at which the workers will be able to reach the Interchange. Default: "127.0.0.1"

        client_ports : triple(int, int, int)
             The ports at which the client can be reached

        launch_cmd : str
             TODO : update

        worker_ports : tuple(int, int)
             The specific two ports at which workers will connect to the Interchange. Default: None

        worker_port_range : tuple(int, int)
             The interchange picks ports at random from this range for workers to connect to.
             This is overridden when the worker_ports option is set. Default: (54000, 55000)

        cores_per_worker : float
             Cores to be assigned to each worker. Oversubscription is possible
             by setting cores_per_worker < 1.0. Default: 1.0

        worker_debug : Bool
             Enables worker debug logging.

        heartbeat_threshold : int
             Number of seconds since the last heartbeat after which worker is considered lost.

        logdir : str
             Parsl log directory path. Logs and temp files go here. Default: '.'

        logging_level : int
             Logging level as defined in the logging module. Default: logging.INFO (20)

        endpoint_id : str
             Identity string that identifies the endpoint to the broker

        poll_period : int
             The main thread polling period, in milliseconds. Default: 10ms

        suppress_failure : Bool
             When set to True, the interchange will attempt to suppress failures. Default: False

        max_heartbeats_missed : int
             Number of heartbeats missed before setting kill_event

        """
        self.logdir = logdir
        try:
            os.makedirs(self.logdir)
        except FileExistsError:
            pass

        start_file_logger("{}/interchange.log".format(self.logdir), level=logging_level)
        logger.info("logger location {}".format(logger.handlers))
        logger.info("Initializing Interchange process with Endpoint ID: {}".format(endpoint_id))
        self.config = config
        logger.info("Got config : {}".format(config))

        self.strategy = self.config.strategy
        self.client_address = client_address
        self.interchange_address = interchange_address
        self.suppress_failure = suppress_failure
        self.poll_period = poll_period

        self.serializer = FuncXSerializer()
        logger.info("Attempting connection to client at {} on ports: {},{},{}".format(
            client_address, client_ports[0], client_ports[1], client_ports[2]))
        self.context = zmq.Context()
        self.task_incoming = self.context.socket(zmq.DEALER)
        self.task_incoming.set_hwm(0)
        self.task_incoming.RCVTIMEO = 10  # in milliseconds
        logger.info("Task incoming on tcp://{}:{}".format(client_address, client_ports[0]))
        self.task_incoming.connect("tcp://{}:{}".format(client_address, client_ports[0]))

        self.results_outgoing = self.context.socket(zmq.DEALER)
        self.results_outgoing.set_hwm(0)
        logger.info("Results outgoing on tcp://{}:{}".format(client_address, client_ports[1]))
        self.results_outgoing.connect("tcp://{}:{}".format(client_address, client_ports[1]))

        self.command_channel = self.context.socket(zmq.DEALER)
        self.command_channel.RCVTIMEO = 1000  # in milliseconds
        # self.command_channel.set_hwm(0)
        logger.info("Command channel on tcp://{}:{}".format(client_address, client_ports[2]))
        self.command_channel.connect("tcp://{}:{}".format(client_address, client_ports[2]))
        logger.info("Connected to client")

        self.pending_task_queue = {}
        self.containers = {}
        self.total_pending_task_count = 0
        self.fxs = FuncXClient()

        logger.info("Interchange address is {}".format(self.interchange_address))
        self.worker_ports = worker_ports
        self.worker_port_range = worker_port_range

        self.task_outgoing = self.context.socket(zmq.ROUTER)
        self.task_outgoing.set_hwm(0)
        self.results_incoming = self.context.socket(zmq.ROUTER)
        self.results_incoming.set_hwm(0)

        # initialize the last heartbeat time to start the loop
        self.last_heartbeat = time.time()
        self.max_heartbeats_missed = max_heartbeats_missed

        self.endpoint_id = endpoint_id
        if self.worker_ports:
            self.worker_task_port = self.worker_ports[0]
            self.worker_result_port = self.worker_ports[1]

            self.task_outgoing.bind("tcp://*:{}".format(self.worker_task_port))
            self.results_incoming.bind("tcp://*:{}".format(self.worker_result_port))

        else:
            self.worker_task_port = self.task_outgoing.bind_to_random_port('tcp://*',
                                                                           min_port=worker_port_range[0],
                                                                           max_port=worker_port_range[1], max_tries=100)
            self.worker_result_port = self.results_incoming.bind_to_random_port('tcp://*',
                                                                                min_port=worker_port_range[0],
                                                                                max_port=worker_port_range[1], max_tries=100)

        logger.info("Bound to ports {},{} for incoming worker connections".format(
            self.worker_task_port, self.worker_result_port))

        self._ready_manager_queue = {}

        self.heartbeat_threshold = heartbeat_threshold
        self.blocks = {}  # type: Dict[str, str]
        self.block_id_map = {}
        self.launch_cmd = launch_cmd
        self.last_core_hr_counter = 0
        if not launch_cmd:
            self.launch_cmd = ("funcx-manager {debug} {max_workers} "
                               "-c {cores_per_worker} "
                               "--poll {poll_period} "
                               "--task_url={task_url} "
                               "--result_url={result_url} "
                               "--logdir={logdir} "
                               "--block_id={{block_id}} "
                               "--hb_period={heartbeat_period} "
                               "--hb_threshold={heartbeat_threshold} "
                               "--worker_mode={worker_mode} "
                               "--scheduler_mode={scheduler_mode} "
                               "--worker_type={{worker_type}} ")

        self.current_platform = {'parsl_v': PARSL_VERSION,
                                 'python_v': "{}.{}.{}".format(sys.version_info.major,
                                                               sys.version_info.minor,
                                                               sys.version_info.micro),
                                 'os': platform.system(),
                                 'hname': platform.node(),
                                 'dir': os.getcwd()}

        logger.info("Platform info: {}".format(self.current_platform))
        self._block_counter = 0
        try:
            self.load_config()
        except Exception:
            logger.exception("Caught exception")
            raise
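
    # A hedged sketch of the client-side counterpart to the three DEALER
    # connections made in __init__ above. It assumes the client binds matching
    # ZMQ sockets on client_ports and exchanges pickled Python objects, which is
    # what the recv_pyobj()/send_pyobj() calls in this class imply; the port
    # numbers and the task payload shape ({'task_id': '<uuid>;<container_uuid>',
    # 'buffer': ...}) are illustrative, taken from get_tasks() and
    # migrate_tasks_to_internal().
    #
    #   import pickle
    #   import zmq
    #
    #   ctx = zmq.Context()
    #   tasks_out = ctx.socket(zmq.DEALER)     # pairs with self.task_incoming
    #   tasks_out.bind("tcp://*:50055")
    #   results_in = ctx.socket(zmq.DEALER)    # pairs with self.results_outgoing
    #   results_in.bind("tcp://*:50056")
    #   commands = ctx.socket(zmq.DEALER)      # pairs with self.command_channel
    #   commands.bind("tcp://*:50057")
    #
    #   tasks_out.send_pyobj({'task_id': 'task-uuid;RAW', 'buffer': b'...'})
    #   result = pickle.loads(results_in.recv())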


    def load_config(self):
        """ Load the config
        """
        logger.info("Loading endpoint local config")
        working_dir = self.config.working_dir
        if self.config.working_dir is None:
            working_dir = "{}/{}".format(self.logdir, "worker_logs")
        logger.info("Setting working_dir: {}".format(working_dir))

        self.config.provider.script_dir = working_dir
        if hasattr(self.config.provider, 'channel'):
            self.config.provider.channel.script_dir = os.path.join(working_dir, 'submit_scripts')
            self.config.provider.channel.makedirs(self.config.provider.channel.script_dir, exist_ok=True)
            os.makedirs(self.config.provider.script_dir, exist_ok=True)

        debug_opts = "--debug" if self.config.worker_debug else ""
        max_workers = "" if self.config.max_workers_per_node == float('inf') \
                      else "--max_workers={}".format(self.config.max_workers_per_node)

        worker_task_url = f"tcp://{self.interchange_address}:{self.worker_task_port}"
        worker_result_url = f"tcp://{self.interchange_address}:{self.worker_result_port}"

        l_cmd = self.launch_cmd.format(debug=debug_opts,
                                       max_workers=max_workers,
                                       cores_per_worker=self.config.cores_per_worker,
                                       #mem_per_worker=self.config.mem_per_worker,
                                       prefetch_capacity=self.config.prefetch_capacity,
                                       task_url=worker_task_url,
                                       result_url=worker_result_url,
                                       nodes_per_block=self.config.provider.nodes_per_block,
                                       heartbeat_period=self.config.heartbeat_period,
                                       heartbeat_threshold=self.config.heartbeat_threshold,
                                       poll_period=self.config.poll_period,
                                       worker_mode=self.config.worker_mode,
                                       scheduler_mode=self.config.scheduler_mode,
                                       logdir=working_dir)
        self.launch_cmd = l_cmd
        logger.info("Launch command: {}".format(self.launch_cmd))

        if self.config.scaling_enabled:
            logger.info("Scaling ...")
            self.scale_out(self.config.provider.init_blocks)
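
    # A hedged illustration of the two-stage launch command templating above:
    # the .format() call in load_config() fills in everything except the doubled
    # braces, so {{block_id}} and {{worker_type}} survive the first pass as
    # {block_id} and {worker_type} and are only resolved later, per block, in
    # scale_out(). After load_config() the command might look like this (all
    # values below are made up for illustration):
    #
    #   "funcx-manager --debug  -c 1.0 --poll 10 "
    #   "--task_url=tcp://127.0.0.1:54123 --result_url=tcp://127.0.0.1:54321 "
    #   "--logdir=./worker_logs --block_id={block_id} --hb_period=30 "
    #   "--hb_threshold=120 --worker_mode=no_container "
    #   "--scheduler_mode=hard --worker_type={worker_type} "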


    def get_tasks(self, count):
        """ Obtains a batch of tasks from the internal pending_task_queue

        Parameters
        ----------
        count: int
            Count of tasks to get from the queue

        Returns
        -------
        List of up to `count` tasks. May return fewer than `count`, down to an empty list,
            e.g. [{'task_id':<x>, 'buffer':<buf>} ... ]
        """
        tasks = []
        # pending_task_queue is a dict of per-task-type queues (see
        # migrate_tasks_to_internal), so drain across the per-type queues
        for task_queue in self.pending_task_queue.values():
            while len(tasks) < count:
                try:
                    tasks.append(task_queue.get(block=False))
                except queue.Empty:
                    break
            if len(tasks) >= count:
                break

        return tasks

    def migrate_tasks_to_internal(self, kill_event, status_request):
        """Pull tasks from the incoming tasks 0mq pipe onto the internal
        pending task queue

        Parameters:
        -----------
        kill_event : threading.Event
              Event to let the thread know when it is time to die.

        status_request : threading.Event
              Event set when the client requests a status report; it is serviced
              and cleared by the main loop in start().
        """
        logger.info("[TASK_PULL_THREAD] Starting")
        task_counter = 0
        poller = zmq.Poller()
        poller.register(self.task_incoming, zmq.POLLIN)

        while not kill_event.is_set():
            # Check when the last heartbeat was.
            # logger.debug(f"[TASK_PULL_THREAD] Last heartbeat: {self.last_heartbeat}")
            if int(time.time() - self.last_heartbeat) > (self.heartbeat_threshold * self.max_heartbeats_missed):
                logger.critical("[TASK_PULL_THREAD] Missed too many heartbeats. Setting kill event.")
                kill_event.set()
                break

            try:
                msg = self.task_incoming.recv_pyobj()
                self.last_heartbeat = time.time()
            except zmq.Again:
                # We just timed out while attempting to receive
                logger.debug("[TASK_PULL_THREAD] {} tasks in internal queue".format(self.total_pending_task_count))
                continue

            if msg == 'STOP':
                kill_event.set()
                break
            elif msg == 'STATUS_REQUEST':
                logger.info("Got STATUS_REQUEST")
                status_request.set()
            else:
                logger.info("[TASK_PULL_THREAD] Received task:{}".format(msg))
                task_type = self.get_container(msg['task_id'].split(";")[1])
                msg['container'] = task_type
                if task_type not in self.pending_task_queue:
                    self.pending_task_queue[task_type] = queue.Queue(maxsize=10 ** 6)
                self.pending_task_queue[task_type].put(msg)
                self.total_pending_task_count += 1
                logger.debug("[TASK_PULL_THREAD] pending task count: {}".format(self.total_pending_task_count))
                task_counter += 1
                logger.debug("[TASK_PULL_THREAD] Fetched task:{}".format(task_counter))

    def get_container(self, container_uuid):
        """ Get the container image location if it is not known to the interchange"""
        if container_uuid not in self.containers:
            if container_uuid == 'RAW' or not container_uuid:
                self.containers[container_uuid] = 'RAW'
            else:
                try:
                    container = self.fxs.get_container(container_uuid, self.config.container_type)
                except Exception:
                    logger.exception("[FETCH_CONTAINER] Unable to resolve container location")
                    self.containers[container_uuid] = 'RAW'
                else:
                    logger.info("[FETCH_CONTAINER] Got container info: {}".format(container))
                    self.containers[container_uuid] = container.get('location', 'RAW')
        return self.containers[container_uuid]

    def get_total_tasks_outstanding(self):
        """ Get the outstanding tasks in total
        """
        outstanding = {}
        for task_type in self.pending_task_queue:
            outstanding[task_type] = outstanding.get(task_type, 0) + self.pending_task_queue[task_type].qsize()
        for manager in self._ready_manager_queue:
            for task_type in self._ready_manager_queue[manager]['tasks']:
                outstanding[task_type] = outstanding.get(task_type, 0) + len(self._ready_manager_queue[manager]['tasks'][task_type])
        return outstanding

    def get_total_live_workers(self):
        """ Get the total active workers
        """
        active = 0
        for manager in self._ready_manager_queue:
            if self._ready_manager_queue[manager]['active']:
                active += self._ready_manager_queue[manager]['max_worker_count']
        return active

    def get_outstanding_breakdown(self):
        """ Get outstanding breakdown per manager and in the interchange queues

        Returns
        -------
        List of status for online elements
        [ (element, tasks_pending, status) ... ]
        """

        pending_on_interchange = self.total_pending_task_count
        # Reporting pending on interchange is a deviation from Parsl
        reply = [('interchange', pending_on_interchange, True)]
        for manager in self._ready_manager_queue:
            resp = (manager.decode('utf-8'),
                    sum([len(tids) for tids in self._ready_manager_queue[manager]['tasks'].values()]),
                    self._ready_manager_queue[manager]['active'])
            reply.append(resp)
        return reply

    def _hold_block(self, block_id):
        """ Sends hold command to all managers which are in a specific block

        Parameters
        ----------
        block_id : str
             Block identifier of the block to be put on hold
        """
        for manager in self._ready_manager_queue:
            if self._ready_manager_queue[manager]['active'] and \
               self._ready_manager_queue[manager]['block_id'] == block_id:
                logger.debug("[HOLD_BLOCK]: Sending hold to manager: {}".format(manager))
                self.hold_manager(manager)

    def hold_manager(self, manager):
        """ Put manager on hold
        Parameters
        ----------

        manager : str
          Manager id to be put on hold while being killed
        """
        if manager in self._ready_manager_queue:
            self._ready_manager_queue[manager]['active'] = False
            reply = True
        else:
            reply = False

    def _command_server(self, kill_event):
        """ Command server to run async command to the interchange
        """
        logger.debug("[COMMAND] Command Server Starting")

        while not kill_event.is_set():
            try:
                command_req = self.command_channel.recv_pyobj()
                logger.debug("[COMMAND] Received command request: {}".format(command_req))
                if command_req == "OUTSTANDING_C":
                    reply = self.get_total_tasks_outstanding()

                elif command_req == "MANAGERS":
                    reply = self.get_outstanding_breakdown()

                elif command_req.startswith("HOLD_WORKER"):
                    cmd, s_manager = command_req.split(';')
                    manager = s_manager.encode('utf-8')
                    logger.info("[CMD] Received HOLD_WORKER for {}".format(manager))
                    if manager in self._ready_manager_queue:
                        self._ready_manager_queue[manager]['active'] = False
                        reply = True
                    else:
                        reply = False

                elif command_req == "HEARTBEAT":
                    logger.info("[CMD] Received heartbeat message from hub")
                    reply = "HBT,{}".format(self.endpoint_id)

                elif command_req == "SHUTDOWN":
                    logger.info("[CMD] Received SHUTDOWN command")
                    kill_event.set()
                    reply = True

                else:
                    reply = None

                logger.debug("[COMMAND] Reply: {}".format(reply))
                self.command_channel.send_pyobj(reply)

            except zmq.Again:
                logger.debug("[COMMAND] is alive")
                continue
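
    # A hedged sketch of how the client side might drive this command server,
    # assuming it holds a socket bound on client_ports[2] and speaks the same
    # pickled request/reply protocol used by the recv_pyobj()/send_pyobj() calls
    # above; socket setup is omitted and the command strings come straight from
    # the handlers in _command_server():
    #
    #   commands.send_pyobj("MANAGERS")
    #   breakdown = commands.recv_pyobj()   # e.g. [('interchange', 5, True), ...]
    #
    #   commands.send_pyobj("HOLD_WORKER;<manager_id>")
    #   held = commands.recv_pyobj()        # True if the manager was found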

    def stop(self):
        """Prepare the interchange for shutdown"""
        self._kill_event.set()

        self._task_puller_thread.join()
        self._command_thread.join()

    def start(self, poll_period=None):
        """ Start the Interchange

        Parameters:
        ----------
        poll_period : int
           poll_period in milliseconds
        """
        logger.info("Incoming ports bound")

        if poll_period is None:
            poll_period = self.poll_period

        start = time.time()
        count = 0

        self._kill_event = threading.Event()
        self._status_request = threading.Event()
        self._task_puller_thread = threading.Thread(target=self.migrate_tasks_to_internal,
                                                    args=(self._kill_event, self._status_request, ))
        self._task_puller_thread.start()

        self._command_thread = threading.Thread(target=self._command_server,
                                                args=(self._kill_event, ))
        self._command_thread.start()

        try:
            logger.debug("Starting strategy.")
            self.strategy.start(self)
        except RuntimeError as e:
            # This is raised when re-registering an endpoint, since the strategy already exists
            logger.debug("Failed to start strategy.")
            logger.info(e)

        poller = zmq.Poller()
        # poller.register(self.task_incoming, zmq.POLLIN)
        poller.register(self.task_outgoing, zmq.POLLIN)
        poller.register(self.results_incoming, zmq.POLLIN)

        # These are managers which we should examine in an iteration
        # for scheduling a job (or maybe any other attention?).
        # Anything altering the state of the manager should add it
        # onto this list.
        interesting_managers = set()

        while not self._kill_event.is_set():
            self.socks = dict(poller.poll(timeout=poll_period))

            # Listen for requests for work
            if self.task_outgoing in self.socks and self.socks[self.task_outgoing] == zmq.POLLIN:
                logger.debug("[MAIN] starting task_outgoing section")
                message = self.task_outgoing.recv_multipart()
                manager = message[0]

                if manager not in self._ready_manager_queue:
                    reg_flag = False

                    try:
                        msg = json.loads(message[1].decode('utf-8'))
                        reg_flag = True
                    except Exception:
                        logger.warning("[MAIN] Got a non-json registration message from manager:{}".format(
                            manager))
                        logger.debug("[MAIN] Message :\n{}\n".format(message))

                    # Set up a default entry; bad nodes/registration messages are ignored by default.
                    self._ready_manager_queue[manager] = {'last': time.time(),
                                                          'reg_time': time.time(),
                                                          'free_capacity': {'total_workers': 0},
                                                          'max_worker_count': 0,
                                                          'active': True,
                                                          'tasks': collections.defaultdict(set),
                                                          'total_tasks': 0}
                    if reg_flag is True:
                        interesting_managers.add(manager)
                        logger.info("[MAIN] Adding manager: {} to ready queue".format(manager))
                        self._ready_manager_queue[manager].update(msg)
                        logger.info("[MAIN] Registration info for manager {}: {}".format(manager, msg))

                        if (msg['python_v'].rsplit(".", 1)[0] != self.current_platform['python_v'].rsplit(".", 1)[0] or
                            msg['parsl_v'] != self.current_platform['parsl_v']):
                            logger.warning("[MAIN] Manager {} has incompatible version info with the interchange".format(manager))

                            if self.suppress_failure is False:
                                logger.debug("Setting kill event")
                                self._kill_event.set()
                                e = ManagerLost(manager)
                                result_package = {'task_id': -1,
                                                  'exception': self.serializer.serialize(e)}
                                pkl_package = pickle.dumps(result_package)
                                self.results_outgoing.send(pkl_package)
                                logger.warning("[MAIN] Sent failure reports, unregistering manager")
                            else:
                                logger.debug("[MAIN] Suppressing shutdown due to version incompatibility")

                    else:
                        # Registration has failed.
                        if self.suppress_failure is False:
                            logger.debug("Setting kill event for bad manager")
                            self._kill_event.set()
                            e = BadRegistration(manager, critical=True)
                            result_package = {'task_id': -1,
                                              'exception': self.serializer.serialize(e)}
                            pkl_package = pickle.dumps(result_package)
                            self.results_outgoing.send(pkl_package)
                        else:
                            logger.debug("[MAIN] Suppressing bad registration from manager:{}".format(
                                manager))

                else:
                    self._ready_manager_queue[manager]['last'] = time.time()
                    if message[1] == b'HEARTBEAT':
                        logger.debug("[MAIN] Manager {} sends heartbeat".format(manager))
                        self.task_outgoing.send_multipart([manager, b'', PKL_HEARTBEAT_CODE])
                    else:
                        manager_adv = pickle.loads(message[1])
                        logger.debug("[MAIN] Manager {} requested {}".format(manager, manager_adv))
                        self._ready_manager_queue[manager]['free_capacity'].update(manager_adv)
                        self._ready_manager_queue[manager]['free_capacity']['total_workers'] = sum(manager_adv.values())
                        interesting_managers.add(manager)

            # If we had received any requests, check if there are tasks that could be passed

            logger.debug("[MAIN] Managers count (total/interesting): {}/{}".format(
                len(self._ready_manager_queue),
                len(interesting_managers)))

            task_dispatch, dispatched_task = naive_interchange_task_dispatch(interesting_managers,
                                                                             self.pending_task_queue,
                                                                             self._ready_manager_queue,
                                                                             scheduler_mode=self.config.scheduler_mode)
            self.total_pending_task_count -= dispatched_task
            count += dispatched_task

            for manager in task_dispatch:
                tasks = task_dispatch[manager]
                if tasks:
                    logger.info("[MAIN] Sending task message {} to manager {}".format(tasks, manager))
                    self.task_outgoing.send_multipart([manager, b'', pickle.dumps(tasks)])

            # Receive any results and forward to client
            if self.results_incoming in self.socks and self.socks[self.results_incoming] == zmq.POLLIN:
                logger.debug("[MAIN] entering results_incoming section")
                manager, *b_messages = self.results_incoming.recv_multipart()
                if manager not in self._ready_manager_queue:
                    logger.warning("[MAIN] Received a result from a un-registered manager: {}".format(manager))
                else:
                    logger.info("[MAIN] Got {} result items in batch".format(len(b_messages)))
                    for b_message in b_messages:
                        r = pickle.loads(b_message)
                        # logger.debug("[MAIN] Received result for task {} from {}".format(r['task_id'], manager))
                        task_type = self.containers[r['task_id'].split(';')[1]]
                        self._ready_manager_queue[manager]['tasks'][task_type].remove(r['task_id'])
                    self._ready_manager_queue[manager]['total_tasks'] -= len(b_messages)
                    self.results_outgoing.send_multipart(b_messages)
                    logger.debug("[MAIN] Current tasks: {}".format(self._ready_manager_queue[manager]['tasks']))
                logger.debug("[MAIN] leaving results_incoming section")

            # logger.debug("[MAIN] entering bad_managers section")
            bad_managers = [manager for manager in self._ready_manager_queue if
                            time.time() - self._ready_manager_queue[manager]['last'] > self.heartbeat_threshold]
            for manager in bad_managers:
                logger.debug("[MAIN] Last: {} Current: {}".format(self._ready_manager_queue[manager]['last'], time.time()))
                logger.warning("[MAIN] Too many heartbeats missed for manager {}".format(manager))
                e = ManagerLost(manager)
                for task_type in self._ready_manager_queue[manager]['tasks']:
                    for tid in self._ready_manager_queue[manager]['tasks'][task_type]:
                        result_package = {'task_id': tid, 'exception': self.serializer.serialize(e)}
                        pkl_package = pickle.dumps(result_package)
                        self.results_outgoing.send(pkl_package)
                logger.warning("[MAIN] Sent failure reports, unregistering manager")
                self._ready_manager_queue.pop(manager, None)
                if manager in interesting_managers:
                    interesting_managers.remove(manager)
            logger.debug("[MAIN] ending one main loop iteration")

            if self._status_request.is_set():
                logger.info("status request response")
                result_package = self.get_status_report()
                pkl_package = pickle.dumps(result_package)
                self.results_outgoing.send(pkl_package)
                logger.info("[MAIN] Sent info response")
                self._status_request.clear()

        delta = time.time() - start
        logger.info("Processed {} tasks in {} seconds".format(count, delta))
        logger.warning("Exiting")

    def get_status_report(self):
        """ Get utilization numbers
        """
        total_cores = 0
        total_mem = 0
        core_hrs = 0
        active_managers = 0
        free_capacity = 0
        outstanding_tasks = self.get_total_tasks_outstanding()
        pending_tasks = self.total_pending_task_count
        num_managers = len(self._ready_manager_queue)
        live_workers = self.get_total_live_workers()
        
        for manager in self._ready_manager_queue:
            total_cores += self._ready_manager_queue[manager]['cores']
            total_mem += self._ready_manager_queue[manager]['mem']
            active_dur = abs(time.time() - self._ready_manager_queue[manager]['reg_time'])
            core_hrs += (active_dur * self._ready_manager_queue[manager]['cores']) / 3600
            if self._ready_manager_queue[manager]['active']:
                active_managers += 1
            free_capacity += self._ready_manager_queue[manager]['free_capacity']['total_workers']

        result_package = {'task_id': -2,
                          'info': {'total_cores': total_cores,
                                   'total_mem' : total_mem,
                                   'new_core_hrs': core_hrs - self.last_core_hr_counter,
                                   'total_core_hrs': round(core_hrs, 2),
                                   'managers': num_managers,
                                   'active_managers': active_managers,
                                   'total_workers': live_workers,
                                   'idle_workers': free_capacity,
                                   'pending_tasks': pending_tasks,
                                   'outstanding_tasks': outstanding_tasks,
                                   'worker_mode': self.config.worker_mode,
                                   'scheduler_mode': self.config.scheduler_mode,
                                   'scaling_enabled': self.config.scaling_enabled,
                                   'mem_per_worker': self.config.mem_per_worker,
                                   'cores_per_worker': self.config.cores_per_worker,
                                   'prefetch_capacity': self.config.prefetch_capacity,
                                   'max_blocks': self.config.provider.max_blocks,
                                   'min_blocks': self.config.provider.min_blocks,
                                   'max_workers_per_node': self.config.max_workers_per_node,
                                   'nodes_per_block': self.config.provider.nodes_per_block
        }}

        self.last_core_hr_counter = core_hrs
        return result_package

    def scale_out(self, blocks=1, task_type=None):
        """Scales out the number of blocks by "blocks"

        Raises:
             ScalingFailed : if the provider fails to provision a block
        """
        r = []
        for i in range(blocks):
            if self.config.provider:
                self._block_counter += 1
                external_block_id = str(self._block_counter)
                if not task_type and self.config.scheduler_mode == 'hard':
                    launch_cmd = self.launch_cmd.format(block_id=external_block_id, worker_type='RAW')
                else:
                    launch_cmd = self.launch_cmd.format(block_id=external_block_id, worker_type=task_type)
                if not task_type:
                    internal_block = self.config.provider.submit(launch_cmd, 1)
                else:
                    internal_block = self.config.provider.submit(launch_cmd, 1, task_type)
                logger.debug("Launched block {}->{}".format(external_block_id, internal_block))
                if not internal_block:
                    raise ScalingFailed(self.config.provider.label,
                                        "Attempts to provision nodes via the provider have failed")
                self.blocks[external_block_id] = internal_block
                self.block_id_map[internal_block] = external_block_id
            else:
                logger.error("No execution provider available")
                r = None
        return r

    def scale_in(self, blocks=None, block_ids=None, task_type=None):
        """Scale in the number of active blocks by specified amount.

        Parameters
        ----------
        blocks : int
            # of blocks to terminate

        block_ids : [str.. ]
            List of external block ids to terminate
        """
        if task_type:
            logger.info("Scaling in blocks of specific task type {}. Let the provider decide which to kill".format(task_type))
            if self.config.scaling_enabled and self.config.provider:
                to_kill, r = self.config.provider.cancel(blocks, task_type)
                logger.info("Get the killed blocks: {}, and status: {}".format(to_kill, r))
                for job in to_kill:
                    logger.info("[scale_in] Getting the block_id map {} for job {}".format(self.block_id_map, job))
                    block_id = self.block_id_map[job]
                    logger.info("[scale_in] Holding block {}".format(block_id))
                    self._hold_block(block_id)
                    self.blocks.pop(block_id)
                return r

        if block_ids:
            block_ids_to_kill = block_ids
        else:
            block_ids_to_kill = list(self.blocks.keys())[:blocks]

        # Try a polite terminate
        # TODO : Missing logic to hold blocks
        for block_id in block_ids_to_kill:
            self._hold_block(block_id)

        # Now kill via provider
        to_kill = [self.blocks.pop(bid) for bid in block_ids_to_kill]

        r = None
        if self.config.scaling_enabled and self.config.provider:
            r = self.config.provider.cancel(to_kill)

        return r

    def provider_status(self):
        """ Get status of all blocks from the provider
        """
        status = []
        if self.config.provider:
            logger.debug("[MAIN] Getting the status of {} blocks.".format(list(self.blocks.values())))
            status = self.config.provider.status(list(self.blocks.values()))
            logger.debug("[MAIN] The status is {}".format(status))

        return status
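
# A minimal, hypothetical driver for the Interchange above, kept as a comment
# sketch because the exact Config import path and provider setup depend on the
# deployment; the Config() instance and the ports shown are assumptions, not part
# of the class itself:
#
#   if __name__ == '__main__':
#       from funcx_endpoint.endpoint.utils.config import Config  # assumed path
#
#       config = Config()                  # provider/strategy come from the config
#       ic = Interchange(config,
#                        client_address="127.0.0.1",
#                        client_ports=(50055, 50056, 50057),
#                        logdir="./interchange_logs")
#       ic.start()                         # runs until STOP/SHUTDOWN or a kill event
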
# Example 2

class EndpointInterchange:
    """Interchange is a task orchestrator for distributed systems.

    1. Asynchronously queue large volume of tasks (>100K)
    2. Allow for workers to join and leave the union
    3. Detect workers that have failed using heartbeats
    4. Service single and batch requests from workers
    5. Be aware of each worker's resource capacity,
       e.g. schedule only jobs that fit into the available walltime.

    TODO: We most likely need a PUB channel to send out global commands, like shutdown
    """
    def __init__(
        self,
        config,
        client_address="127.0.0.1",
        interchange_address="127.0.0.1",
        client_ports: Tuple[int, int, int] = (50055, 50056, 50057),
        launch_cmd=None,
        logdir=".",
        endpoint_id=None,
        keys_dir=".curve",
        suppress_failure=True,
        endpoint_dir=".",
        endpoint_name="default",
        reg_info=None,
        funcx_client_options=None,
        results_ack_handler=None,
    ):
        """
        Parameters
        ----------
        config : funcx.Config object
             Funcx config object that describes how compute should be provisioned

        client_address : str
             The ip address at which the parsl client can be reached.
             Default: "127.0.0.1"

        interchange_address : str
             The ip address at which the workers will be able to reach the Interchange.
             Default: "127.0.0.1"

        client_ports : Tuple[int, int, int]
             The ports at which the client can be reached

        launch_cmd : str
             TODO : update

        logdir : str
             Parsl log directory path. Logs and temp files go here. Default: '.'

        keys_dir : str
             Directory where the keys used for communicating with the funcX
             service (forwarders) are stored

        endpoint_id : str
             Identity string that identifies the endpoint to the broker

        suppress_failure : Bool
             When set to True, the interchange will attempt to suppress failures.
             Default: True

        endpoint_dir : str
             Endpoint directory path to store registration info in

        endpoint_name : str
             Name of endpoint

        reg_info : Dict
             Registration info from initial registration on endpoint start, if it
             succeeded

        funcx_client_options : Dict
             FuncXClient initialization options

        results_ack_handler : ResultsAckHandler
             Tracks results that have been sent but not yet acknowledged by the
             funcX services, so that they can be persisted and resent
        """
        self.logdir = logdir
        log.info(
            f"Initializing EndpointInterchange process with Endpoint ID: {endpoint_id}"
        )
        self.config = config
        log.info(f"Got config: {config}")

        self.client_address = client_address
        self.interchange_address = interchange_address
        self.client_ports = client_ports
        self.suppress_failure = suppress_failure

        self.endpoint_dir = endpoint_dir
        self.endpoint_name = endpoint_name

        if funcx_client_options is None:
            funcx_client_options = {}
        self.funcx_client = FuncXClient(**funcx_client_options)

        self.initial_registration_complete = False
        if reg_info:
            self.initial_registration_complete = True
            self.apply_reg_info(reg_info)

        self.heartbeat_period = self.config.heartbeat_period
        self.heartbeat_threshold = self.config.heartbeat_threshold
        # initialize the last heartbeat time to start the loop
        self.last_heartbeat = time.time()
        self.keys_dir = keys_dir
        self.serializer = FuncXSerializer()

        self.pending_task_queue = Queue()
        self.containers = {}
        self.total_pending_task_count = 0

        self._quiesce_event = threading.Event()
        self._kill_event = threading.Event()

        self.results_ack_handler = results_ack_handler

        log.info(f"Interchange address is {self.interchange_address}")

        self.endpoint_id = endpoint_id

        self.current_platform = {
            "parsl_v": PARSL_VERSION,
            "python_v": "{}.{}.{}".format(
                sys.version_info.major,
                sys.version_info.minor,
                sys.version_info.micro,
            ),
            "libzmq_v": zmq.zmq_version(),
            "pyzmq_v": zmq.pyzmq_version(),
            "os": platform.system(),
            "hname": platform.node(),
            "funcx_sdk_version": funcx_sdk_version,
            "funcx_endpoint_version": funcx_endpoint_version,
            "registration": self.endpoint_id,
            "dir": os.getcwd(),
        }

        log.info(f"Platform info: {self.current_platform}")
        try:
            self.load_config()
        except Exception:
            log.exception("Caught exception")
            raise

        self.tasks = set()
        self.task_status_deltas = {}

        self._test_start = False

    def load_config(self):
        """Load the config"""
        log.info("Loading endpoint local config")

        self.results_passthrough = mpQueue()
        self.executors = {}
        for executor in self.config.executors:
            log.info(f"Initializing executor: {executor.label}")
            executor.funcx_service_address = self.config.funcx_service_address
            if not executor.endpoint_id:
                executor.endpoint_id = self.endpoint_id
            elif executor.endpoint_id != self.endpoint_id:
                raise Exception("InconsistentEndpointId")
            self.executors[executor.label] = executor
            if executor.run_dir is None:
                executor.run_dir = self.logdir

    def start_executors(self):
        log.info("Starting Executors")
        for executor in self.config.executors:
            if hasattr(executor, "passthrough") and executor.passthrough is True:
                executor.start(results_passthrough=self.results_passthrough)

    def apply_reg_info(self, reg_info):
        self.client_address = reg_info["public_ip"]
        self.client_ports = (
            reg_info["tasks_port"],
            reg_info["results_port"],
            reg_info["commands_port"],
        )

    def register_endpoint(self):
        reg_info = register_endpoint(self.funcx_client, self.endpoint_id,
                                     self.endpoint_dir, self.endpoint_name)
        self.apply_reg_info(reg_info)
        return reg_info
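
    # A hedged sketch of the registration info consumed by apply_reg_info() above;
    # only the keys actually read in this class are shown, and the values are
    # illustrative:
    #
    #   reg_info = {
    #       "endpoint_id": "aaaa-bbbb-cccc",   # logged in _start_threads_and_main()
    #       "public_ip": "192.0.2.10",         # becomes self.client_address
    #       "tasks_port": 55001,               # client_ports[0]
    #       "results_port": 55002,             # client_ports[1]
    #       "commands_port": 55003,            # client_ports[2]
    #   }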

    def migrate_tasks_to_internal(self, quiesce_event):
        """Pull tasks from the incoming tasks 0mq pipe onto the internal
        pending task queue

        Parameters:
        -----------
        quiesce_event : threading.Event
              Event to let the thread know when it is time to die.
        """
        log.info("[TASK_PULL_THREAD] Starting")

        try:
            self._task_puller_loop(quiesce_event)
        except Exception:
            log.exception("[TASK_PULL_THREAD] Unhandled exception")
        finally:
            quiesce_event.set()
            self.task_incoming.close()
            log.info("[TASK_PULL_THREAD] Thread loop exiting")

    def _task_puller_loop(self, quiesce_event):
        task_counter = 0
        # Create the incoming queue in this thread so that the zmq socket stays
        # in the same thread; zmq sockets are not thread-safe
        self.task_incoming = TaskQueue(
            self.client_address,
            port=self.client_ports[0],
            identity=self.endpoint_id,
            mode="client",
            set_hwm=True,
            keys_dir=self.keys_dir,
            RCVTIMEO=1000,
            linger=0,
        )

        self.task_incoming.put("forwarder",
                               pickle.dumps(self.current_platform))
        log.info(
            f"Task incoming on tcp://{self.client_address}:{self.client_ports[0]}"
        )

        self.last_heartbeat = time.time()

        while not quiesce_event.is_set():

            try:
                if int(time.time() -
                       self.last_heartbeat) > self.heartbeat_threshold:
                    log.critical(
                        "[TASK_PULL_THREAD] Missed too many heartbeats. "
                        "Setting quiesce event.")
                    quiesce_event.set()
                    break

                try:
                    # TODO : Check the kwarg options for get
                    raw_msg = self.task_incoming.get()[0]
                    self.last_heartbeat = time.time()
                except zmq.Again:
                    # We just timed out while attempting to receive
                    log.debug(
                        "[TASK_PULL_THREAD] {} tasks in internal queue".format(
                            self.total_pending_task_count))
                    continue
                except Exception:
                    log.exception(
                        "[TASK_PULL_THREAD] Unknown exception while waiting for tasks"
                    )
                    continue

                # YADU: TODO We need to do the routing here
                try:
                    msg = Message.unpack(raw_msg)
                except Exception:
                    log.exception(
                        "[TASK_PULL_THREAD] Failed to unpack message from forwarder"
                    )
                    continue

                if msg == "STOP":
                    self._kill_event.set()
                    quiesce_event.set()
                    break

                elif isinstance(msg, Heartbeat):
                    log.info(
                        "[TASK_PULL_THREAD] Got heartbeat from funcx-forwarder"
                    )

                elif isinstance(msg, Task):
                    log.info(f"[TASK_PULL_THREAD] Received task:{msg.task_id}")
                    self.pending_task_queue.put(msg)
                    self.total_pending_task_count += 1
                    self.task_status_deltas[msg.task_id] = TaskStatusCode.WAITING_FOR_NODES
                    task_counter += 1
                    log.debug(
                        "[TASK_PULL_THREAD] Task counter:%s Pending Tasks: %s",
                        task_counter,
                        self.total_pending_task_count,
                    )

                elif isinstance(msg, ResultsAck):
                    self.results_ack_handler.ack(msg.task_id)

                else:
                    log.warning(
                        f"[TASK_PULL_THREAD] Unknown message type received: {msg}"
                    )

            except Exception:
                log.exception(
                    "[TASK_PULL_THREAD] Something really bad happened")
                continue

    def get_container(self, container_uuid):
        """Get the container image location if it is not known to the interchange"""
        if container_uuid not in self.containers:
            if container_uuid == "RAW" or not container_uuid:
                self.containers[container_uuid] = "RAW"
            else:
                try:
                    container = self.funcx_client.get_container(
                        container_uuid, self.config.container_type)
                except Exception:
                    log.exception(
                        "[FETCH_CONTAINER] Unable to resolve container location"
                    )
                    self.containers[container_uuid] = "RAW"
                else:
                    log.info(
                        f"[FETCH_CONTAINER] Got container info: {container}")
                    self.containers[container_uuid] = container.get(
                        "location", "RAW")
        return self.containers[container_uuid]

    def _command_server(self, quiesce_event):
        """Command server to run async command to the interchange

        We want to be able to receive the following not yet implemented/updated
        commands:
         - OutstandingCount
         - ListManagers (get outstanding broken down by manager)
         - HoldWorker
         - Shutdown
        """
        log.debug("[COMMAND] Command Server Starting")

        try:
            self._command_server_loop(quiesce_event)
        except Exception:
            log.exception("[COMMAND] Unhandled exception")
        finally:
            quiesce_event.set()
            self.command_channel.close()
            log.info("[COMMAND] Thread loop exiting")

    def _command_server_loop(self, quiesce_event):
        self.command_channel = TaskQueue(
            self.client_address,
            port=self.client_ports[2],
            identity=self.endpoint_id,
            mode="client",
            RCVTIMEO=1000,  # in milliseconds
            keys_dir=self.keys_dir,
            set_hwm=True,
            linger=0,
        )

        # TODO: Register all channels with the authentication string.
        self.command_channel.put(
            "forwarder", pickle.dumps({"registration": self.endpoint_id}))

        while not quiesce_event.is_set():
            try:
                # Wait for 1000 ms
                buffer = self.command_channel.get(timeout=1000)
                log.debug(f"[COMMAND] Received command request {buffer}")
                command = Message.unpack(buffer)
                if command.type not in COMMAND_TYPES:
                    log.error(
                        "Received incorrect message type on command channel")
                    self.command_channel.put(bytes())
                    continue

                if command.type is MessageType.HEARTBEAT_REQ:
                    log.info("[COMMAND] Received synchronous HEARTBEAT_REQ from hub")
                    log.info(f"[COMMAND] Replying with Heartbeat({self.endpoint_id})")
                    reply = Heartbeat(self.endpoint_id)
                else:
                    # Guard against replying with an unbound `reply` for command
                    # types that are in COMMAND_TYPES but not yet handled here
                    log.error(f"[COMMAND] Unhandled command type: {command.type}")
                    self.command_channel.put(bytes())
                    continue

                log.debug(f"[COMMAND] Reply: {reply}")
                self.command_channel.put(reply.pack())

            except zmq.Again:
                # log.debug("[COMMAND] is alive")
                continue

    def quiesce(self):
        """Temporarily stop everything on the interchange in order to reach a consistent
        state before attempting to start again. This must be called on the main thread
        """
        log.info(
            "Interchange Quiesce in progress (stopping and joining all threads)"
        )
        self._quiesce_event.set()
        self._task_puller_thread.join()
        self._command_thread.join()

        log.info("Saving unacked results to disk")
        try:
            self.results_ack_handler.persist()
        except Exception:
            log.exception("Caught exception while saving unacked results")
            log.warning(
                "Interchange will continue without saving unacked results")

        # this must be called last to ensure the next interchange run will occur
        self._quiesce_event.clear()

    def stop(self):
        """Prepare the interchange for shutdown"""
        log.info("Shutting down EndpointInterchange")

        # TODO: shut down executors gracefully

        # kill_event must be set before quiesce_event because we need to guarantee that
        # once the quiesce is complete, the interchange will not try to start again
        self._kill_event.set()
        self._quiesce_event.set()

    def handle_sigterm(self, sig_num, curr_stack_frame):
        log.warning(
            "Received SIGTERM, attempting to save unacked results to disk")
        try:
            self.results_ack_handler.persist()
        except Exception:
            log.exception("Caught exception while saving unacked results")
        else:
            log.info("Unacked results successfully saved to disk")
        sys.exit(1)

    def start(self):
        """Start the Interchange"""
        log.info("Starting EndpointInterchange")

        signal.signal(signal.SIGTERM, self.handle_sigterm)

        self._quiesce_event.clear()
        self._kill_event.clear()

        # NOTE: currently we only start the executors once because
        # the current behavior is to keep them running decoupled while
        # the endpoint is waiting for reconnection
        self.start_executors()

        while not self._kill_event.is_set():
            self._start_threads_and_main()
            self.quiesce()
            # this check is solely for testing to force this loop to only run once
            if self._test_start:
                break

        log.info("EndpointInterchange shutdown complete.")

    def _start_threads_and_main(self):
        # re-register on every loop start
        if not self.initial_registration_complete:
            # Register the endpoint
            log.info("Running endpoint registration retry loop")
            reg_info = retry_call(self.register_endpoint,
                                  delay=10,
                                  max_delay=300,
                                  backoff=1.2)
            log.info("Endpoint registered with UUID: {}".format(
                reg_info["endpoint_id"]))

        self.initial_registration_complete = False

        log.info(
            "Attempting connection to client at {} on ports: {},{},{}".format(
                self.client_address,
                self.client_ports[0],
                self.client_ports[1],
                self.client_ports[2],
            ))

        self._task_puller_thread = threading.Thread(
            target=self.migrate_tasks_to_internal,
            args=(self._quiesce_event, ))
        self._task_puller_thread.start()

        self._command_thread = threading.Thread(target=self._command_server,
                                                args=(self._quiesce_event, ))
        self._command_thread.start()

        try:
            self._main_loop()
        except Exception:
            log.exception("[MAIN] Unhandled exception")
        finally:
            self.results_outgoing.close()
            log.info("[MAIN] Thread loop exiting")

    def _main_loop(self):
        self.results_outgoing = TaskQueue(
            self.client_address,
            port=self.client_ports[1],
            identity=self.endpoint_id,
            mode="client",
            keys_dir=self.keys_dir,
            # Fail immediately if results cannot be sent back
            SNDTIMEO=0,
            set_hwm=True,
            linger=0,
        )
        self.results_outgoing.put(
            "forwarder", pickle.dumps({"registration": self.endpoint_id}))

        # TODO: this resend must happen after any endpoint re-registration to
        # ensure there are not unacked results left
        resend_results_messages = self.results_ack_handler.get_unacked_results_list()
        if len(resend_results_messages) > 0:
            log.info(
                "[MAIN] Resending %s previously unacked results",
                len(resend_results_messages),
            )

        # TODO: this should be a multipart send rather than a loop
        for results in resend_results_messages:
            self.results_outgoing.put("forwarder", results)

        executor = list(self.executors.values())[0]
        last = time.time()

        while not self._quiesce_event.is_set():
            if last + self.heartbeat_threshold < time.time():
                log.debug("[MAIN] alive")
                last = time.time()
                try:
                    # Adding results heartbeat to essentially force a TCP keepalive
                    # without meddling with OS TCP keepalive defaults
                    self.results_outgoing.put("forwarder", b"HEARTBEAT")
                except Exception:
                    log.exception(
                        "[MAIN] Sending heartbeat to the forwarder over the results "
                        "channel has failed")
                    raise

            self.results_ack_handler.check_ack_counts()

            try:
                task = self.pending_task_queue.get(block=True, timeout=0.01)
                executor.submit_raw(task.pack())
            except queue.Empty:
                pass
            except Exception:
                log.exception(
                    "[MAIN] Unhandled issue while waiting for pending tasks")

            try:
                results = self.results_passthrough.get(False, 0.01)

                task_id = results["task_id"]
                if task_id:
                    self.results_ack_handler.put(task_id, results["message"])
                    log.info(f"Passing result to forwarder for task {task_id}")

                # results will be a pickled dict with task_id, container_id,
                # and results/exception
                self.results_outgoing.put("forwarder", results["message"])

            except queue.Empty:
                pass

            except Exception:
                log.exception(
                    "[MAIN] Something broke while forwarding results from executor "
                    "to forwarder queues")
                continue
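
    # A hedged sketch of the items the loop above expects on
    # self.results_passthrough, based only on the keys read in _main_loop();
    # the exact contents of "message" are produced by the executor and are an
    # assumption here:
    #
    #   {
    #       "task_id": "<task uuid>",   # may be falsy for non-result messages
    #       "message": b"...",          # pickled dict with task_id, container_id,
    #                                   # and result or exception (see comment above)
    #   }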

    def get_status_report(self):
        """Get utilization numbers"""
        total_cores = 0
        total_mem = 0
        core_hrs = 0
        active_managers = 0
        free_capacity = 0
        outstanding_tasks = self.get_total_tasks_outstanding()
        pending_tasks = self.total_pending_task_count
        num_managers = len(self._ready_manager_queue)
        live_workers = self.get_total_live_workers()

        for manager in self._ready_manager_queue:
            total_cores += self._ready_manager_queue[manager]["cores"]
            total_mem += self._ready_manager_queue[manager]["mem"]
            active_dur = abs(time.time() -
                             self._ready_manager_queue[manager]["reg_time"])
            core_hrs += (
                active_dur * self._ready_manager_queue[manager]["cores"]) / 3600
            if self._ready_manager_queue[manager]["active"]:
                active_managers += 1
            free_capacity += self._ready_manager_queue[manager][
                "free_capacity"]["total_workers"]

        result_package = {
            "task_id": -2,
            "info": {
                "total_cores": total_cores,
                "total_mem": total_mem,
                "new_core_hrs": core_hrs - self.last_core_hr_counter,
                "total_core_hrs": round(core_hrs, 2),
                "managers": num_managers,
                "active_managers": active_managers,
                "total_workers": live_workers,
                "idle_workers": free_capacity,
                "pending_tasks": pending_tasks,
                "outstanding_tasks": outstanding_tasks,
                "worker_mode": self.config.worker_mode,
                "scheduler_mode": self.config.scheduler_mode,
                "scaling_enabled": self.config.scaling_enabled,
                "mem_per_worker": self.config.mem_per_worker,
                "cores_per_worker": self.config.cores_per_worker,
                "prefetch_capacity": self.config.prefetch_capacity,
                "max_blocks": self.config.provider.max_blocks,
                "min_blocks": self.config.provider.min_blocks,
                "max_workers_per_node": self.config.max_workers_per_node,
                "nodes_per_block": self.config.provider.nodes_per_block,
            },
        }

        self.last_core_hr_counter = core_hrs
        return result_package

    def scale_out(self, blocks=1, task_type=None):
        """Scales out the number of blocks by "blocks"

        Raises:
             ScalingFailed : if the provider fails to provision a block
        """
        r = []
        for _i in range(blocks):
            if self.config.provider:
                self._block_counter += 1
                external_block_id = str(self._block_counter)
                if not task_type and self.config.scheduler_mode == "hard":
                    launch_cmd = self.launch_cmd.format(
                        block_id=external_block_id, worker_type="RAW")
                else:
                    launch_cmd = self.launch_cmd.format(
                        block_id=external_block_id, worker_type=task_type)
                if not task_type:
                    internal_block = self.config.provider.submit(launch_cmd, 1)
                else:
                    internal_block = self.config.provider.submit(
                        launch_cmd, 1, task_type)
                log.debug(
                    f"Launched block {external_block_id}->{internal_block}")
                if not internal_block:
                    raise ScalingFailed(
                        self.config.provider.label,
                        "Attempts to provision nodes via the provider have failed",
                    )
                self.blocks[external_block_id] = internal_block
                self.block_id_map[internal_block] = external_block_id
            else:
                log.error("No execution provider available")
                r = None
        return r

    def scale_in(self, blocks=None, block_ids=None, task_type=None):
        """Scale in the number of active blocks by specified amount.

        Parameters
        ----------
        blocks : int
            # of blocks to terminate

        block_ids : [str.. ]
            List of external block ids to terminate
        """
        if block_ids is None:
            block_ids = []
        if task_type:
            log.info(
                "Scaling in blocks of specific task type %s. "
                "Let the provider decide which to kill",
                task_type,
            )
            if self.config.scaling_enabled and self.config.provider:
                to_kill, r = self.config.provider.cancel(blocks, task_type)
                log.info(f"Get the killed blocks: {to_kill}, and status: {r}")
                for job in to_kill:
                    log.info(
                        "[scale_in] Getting the block_id map {} for job {}".
                        format(self.block_id_map, job))
                    block_id = self.block_id_map[job]
                    log.info(f"[scale_in] Holding block {block_id}")
                    self._hold_block(block_id)
                    self.blocks.pop(block_id)
                return r

        if block_ids:
            block_ids_to_kill = block_ids
        else:
            block_ids_to_kill = list(self.blocks.keys())[:blocks]

        # Try a polite terminate
        # TODO : Missing logic to hold blocks
        for block_id in block_ids_to_kill:
            self._hold_block(block_id)

        # Now kill via provider
        to_kill = [self.blocks.pop(bid) for bid in block_ids_to_kill]

        r = None
        if self.config.scaling_enabled and self.config.provider:
            r = self.config.provider.cancel(to_kill)

        return r

    def provider_status(self):
        """Get status of all blocks from the provider"""
        status = []
        if self.config.provider:
            log.debug("[MAIN] Getting the status of {} blocks.".format(
                list(self.blocks.values())))
            status = self.config.provider.status(list(self.blocks.values()))
            log.debug(f"[MAIN] The status is {status}")

        return status
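
# A minimal, hypothetical driver for the EndpointInterchange above, again kept as
# a comment sketch; the Config import path, the ResultsAckHandler name, and the
# funcx_client_options values are assumptions for illustration only:
#
#   if __name__ == "__main__":
#       from funcx_endpoint.endpoint.utils.config import Config            # assumed path
#       from funcx_endpoint.endpoint.results_ack import ResultsAckHandler  # assumed path
#
#       ic = EndpointInterchange(
#           Config(),
#           endpoint_id="aaaa-bbbb-cccc",
#           logdir="./endpoint_logs",
#           funcx_client_options={"funcx_service_address": "https://api2.funcx.org/v2"},
#           results_ack_handler=ResultsAckHandler(),
#       )
#       ic.start()   # registers (or re-registers) and runs until stop()/SIGTERM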