Example #1
def worker(comm, rank):
    logger.info("Worker started")

    # Sync worker with master
    comm.Barrier()
    logger.debug("Synced")

    task_request = b'TREQ'

    while True:
        comm.send(task_request, dest=0, tag=TASK_REQUEST_TAG)
        # The worker will receive {'task_id':<tid>, 'buffer':<buf>}
        req = comm.recv(source=0, tag=rank)
        logger.debug("Got req: {}".format(req))
        tid = req['task_id']
        logger.debug("Got task: {}".format(tid))

        try:
            result = execute_task(req['buffer'])
        except Exception as e:
            result_package = {'task_id': tid, 'exception': serialize(RemoteExceptionWrapper(*sys.exc_info()))}
            logger.debug("No result due to exception: {} with result package {}".format(e, result_package))
        else:
            result_package = {'task_id': tid, 'result': serialize(result)}
            logger.debug("Result: {}".format(result))

        pkl_package = pickle.dumps(result_package)
        comm.send(pkl_package, dest=0, tag=RESULT_TAG)
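The worker above speaks a small request/response protocol with rank 0. For context, a minimal sketch of the master side, assuming mpi4py and the same TASK_REQUEST_TAG / RESULT_TAG constants; this counterpart is not part of the snippet, and shutdown handling is omitted:

# Master-side sketch (assumptions: mpi4py, tag constants shared with the worker,
# and a caller that passes pre-serialized task buffers).
import pickle
from mpi4py import MPI

def master(comm, tasks):
    comm.Barrier()                       # pairs with the worker's sync barrier
    status = MPI.Status()
    for task_id, buf in tasks:
        # wait for any worker to send a b'TREQ' task request
        comm.recv(source=MPI.ANY_SOURCE, tag=TASK_REQUEST_TAG, status=status)
        worker_rank = status.Get_source()
        # reply on the worker's own rank tag, matching its recv(source=0, tag=rank)
        comm.send({'task_id': task_id, 'buffer': buf}, dest=worker_rank, tag=worker_rank)
    # each completed task comes back as one pickled result package
    return [pickle.loads(comm.recv(source=MPI.ANY_SOURCE, tag=RESULT_TAG))
            for _ in tasks]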
Example #2
def worker(worker_id, task_url, debug=True, logdir="workers", uid="1"):
    """ TODO: docstring

    TODO : Cleanup debug, logdir and uid to function correctly
    """

    start_file_logger('{}/{}/worker_{}.log'.format(logdir, uid, worker_id),
                      0,
                      level=logging.DEBUG if debug is True else logging.INFO)

    logger.info("Starting worker {}".format(worker_id))

    task_ids_received = []

    message_q = zmq_pipes.WorkerMessages(task_url)

    while True:
        print("Worker loop iteration starting")
        task_id, buf = message_q.get()
        task_ids_received.append(task_id)

        user_ns = locals()
        user_ns.update({'__builtins__': __builtins__})
        f, args, kwargs = unpack_apply_message(buf, user_ns, copy=False)

        logger.debug("Worker {} received task {}".format(worker_id, task_id))
        result = execute_task(f, args, kwargs, user_ns)
        logger.debug("Worker {} completed task {}".format(worker_id, task_id))

        reply = {"result": result, "worker_id": worker_id}
        message_q.put(task_id, serialize(reply))
        logger.debug("Result sent")
Example #3
def main():
    """Execute one rank of an MPI application."""
    logging.basicConfig(level=logging.INFO,
                        format="%(asctime)s [%(levelname)s] %(message)s")
    parser = argparse.ArgumentParser()
    parser.add_argument("-i",
                        "--input",
                        required=True,
                        help="Input pickle file")
    parser.add_argument("-o",
                        "--output",
                        required=True,
                        help="Output pickle file")
    args = parser.parse_args()
    logging.info("Input : %s", args.input)
    logging.info("Output : %s", args.output)
    returnval = None
    exception = None
    # open and deserialize the task's pickled input package
    with open(args.input, "rb") as file_handle:
        fn_buf = file_handle.read()
    logging.info("Read input pickle file")
    try:
        returnval = execute_task(fn_buf)
    except Exception as exc:
        logging.exception("Parsl task execution failed:")
        exception = exc
    else:
        logging.info("Finished execution")
    # only rank 0 should write/return a result; other ranks exit
    if int(os.environ["FLUX_TASK_RANK"]) == 0:
        # write the result to the output file
        result_buf = serialize(TaskResult(returnval, exception))
        with open(args.output, "wb") as file_handle:
            file_handle.write(result_buf)
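On the submitting side, the output file written by rank 0 can be read back and unwrapped. A sketch, assuming TaskResult exposes the returnval and exception values it was constructed with above, and that deserialize is the inverse of serialize:

# Sketch: consuming the output pickle written by rank 0.
def read_task_result(output_path):
    with open(output_path, "rb") as fh:
        task_result = deserialize(fh.read())     # assumed inverse of serialize
    if task_result.exception is not None:
        raise task_result.exception              # surface the remote failure locally
    return task_result.returnval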
Example #4
    def worker_watchdog(self, kill_event):
        """ Listens on the pending_result_queue and sends out results via 0mq

        Parameters:
        -----------
        kill_event : threading.Event
              Event to let the thread know when it is time to die.
        """

        logger.debug("[WORKER_WATCHDOG_THREAD] Starting thread")

        while not kill_event.is_set():
            for worker_id, p in self.procs.items():
                if not p.is_alive():
                    logger.info(
                        "[WORKER_WATCHDOG_THREAD] Worker {} has died".format(
                            worker_id))
                    try:
                        task = self._tasks_in_progress.pop(worker_id)
                        logger.info(
                            "[WORKER_WATCHDOG_THREAD] Worker {} was busy when it died"
                            .format(worker_id))
                        try:
                            raise WorkerLost(worker_id, platform.node())
                        except Exception:
                            logger.info(
                                "[WORKER_WATCHDOG_THREAD] Putting exception for task {} in the pending result queue"
                                .format(task['task_id']))
                            result_package = {
                                'task_id': task['task_id'],
                                'exception': serialize(
                                    RemoteExceptionWrapper(*sys.exc_info()))
                            }
                            pkl_package = pickle.dumps(result_package)
                            self.pending_result_queue.put(pkl_package)
                    except KeyError:
                        logger.info(
                            "[WORKER_WATCHDOG_THREAD] Worker {} was not busy when it died"
                            .format(worker_id))

                    p = multiprocessing.Process(
                        target=worker,
                        args=(worker_id, self.uid, self.worker_count,
                              self.pending_task_queue,
                              self.pending_result_queue,
                              self.ready_worker_queue, self._tasks_in_progress,
                              self.cpu_affinity),
                        name="HTEX-Worker-{}".format(worker_id))
                    # The replacement must be started, otherwise is_alive()
                    # stays False and the watchdog keeps "restarting" it.
                    p.start()
                    self.procs[worker_id] = p
                    logger.info(
                        "[WORKER_WATCHDOG_THREAD] Worker {} has been restarted"
                        .format(worker_id))
                time.sleep(self.poll_period)

        logger.critical("[WORKER_WATCHDOG_THREAD] Exiting")
Example #5
def id_for_memo_tuple(denormalized_tuple, output_ref=False):
    if type(denormalized_tuple) is not tuple:
        raise ValueError("id_for_memo_tuple cannot work on subclasses of tuple")

    normalized_list = []

    for e in denormalized_tuple:
        normalized_list.append(id_for_memo(e, output_ref=output_ref))

    return serialize(normalized_list)
Example #6
def id_for_memo_list(denormalized_list, output_ref=False):
    if type(denormalized_list) is not list:
        raise ValueError("id_for_memo_list cannot work on subclasses of list")

    normalized_list = []

    for e in denormalized_list:
        normalized_list.append(id_for_memo(e, output_ref=output_ref))

    return serialize(normalized_list)
Example #7
def id_for_memo_dict(denormalized_dict, output_ref=False):
    """This normalises the keys and values of the supplied dictionary.

    When output_ref=True, the values are normalised as output refs, but
    the keys are not.
    """
    if type(denormalized_dict) is not dict:
        raise ValueError("id_for_memo_dict cannot work on subclasses of dict")

    keys = sorted(denormalized_dict)

    normalized_list = []
    for k in keys:
        normalized_list.append(id_for_memo(k))
        normalized_list.append(id_for_memo(denormalized_dict[k], output_ref=output_ref))
    return serialize(normalized_list)
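These id_for_memo_* handlers call id_for_memo recursively on their elements, which suggests a type-dispatched entry point. One plausible wiring with functools.singledispatch (the registration shown here is an assumption, not taken from the snippets):

# Sketch: a type-dispatched id_for_memo so the recursive calls in the handlers
# above resolve to the right per-type normaliser.
from functools import singledispatch

@singledispatch
def id_for_memo(obj, output_ref=False):
    raise ValueError("unsupported type for memoization: {}".format(type(obj)))

id_for_memo.register(tuple, id_for_memo_tuple)
id_for_memo.register(list, id_for_memo_list)
id_for_memo.register(dict, id_for_memo_dict)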
Example #8
def id_for_memo_function(function, output_ref=False):
    """This produces function hash material using the source definition of the
       function.

       The standard serialize_object based approach cannot be used as it is
       too sensitive to irrelevant facts such as the source line, meaning
       a whitespace line added at the top of a source file will cause the hash
       to change.
    """
    logger.debug(
        "serialising id_for_memo_function for function {}, type {}".format(
            function, type(function)))
    try:
        fn_source = getsource(function)
    except Exception as e:
        logger.warning(
            "Unable to get source code for app caching. Recommend creating module. Exception was: {}"
            .format(e))
        fn_source = function.__name__
    return serialize(fn_source.encode('utf-8'))
Example #9
    def send(self, message: object) -> None:
        logger.info("Sending a monitoring message via filesystem")

        # This name should be randomised with things like worker ID and process
        # ID, because in general many FilesystemRadio objects share the same
        # space (even from the same process). The per-instance radio_uid and
        # counter used here disambiguate within one process at one instant, but
        # not between other things: e.g. different hosts, different processes,
        # or non-overlapping instantiations of the same process.
        unique_id = f"msg-{self.radio_uid}-{self.id_counter}"

        self.id_counter = self.id_counter + 1

        tmp_filename = f"{self.tmp_path}/{unique_id}"
        new_filename = f"{self.new_path}/{unique_id}"
        buffer = (message, "NA")

        # this will write the message out then atomically
        # move it into new/, so that a partially written
        # file will never be observed in new/
        with open(tmp_filename, "wb") as f:
            f.write(serialize(buffer))
        os.rename(tmp_filename, new_filename)
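The write-to-tmp-then-rename dance means a consumer only ever observes complete files under new/. A sketch of a matching receive loop; the directory layout and the (message, "NA") tuple come from the sender above, everything else (including deserialize as the inverse of serialize) is assumed:

# Sketch: drain messages the sender above placed in new/.
import os

def drain_messages(new_path):
    messages = []
    for name in sorted(os.listdir(new_path)):
        full_path = os.path.join(new_path, name)
        with open(full_path, "rb") as f:
            message, _tag = deserialize(f.read())   # sender wrote (message, "NA")
        messages.append(message)
        os.unlink(full_path)                        # consume the file exactly once
    return messages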
Example #10
def runner(incoming_q, outgoing_q):
    """This is a function that mocks the Swift-T side.

    It listens on the incoming_q for tasks and posts results on the outgoing_q.

    Args:
         - incoming_q (Queue object) : The queue to listen on
         - outgoing_q (Queue object) : Queue to post results on

    The messages posted on the incoming_q will be of the form:

    .. code:: python

       {
          "task_id" : <uuid.uuid4 string>,
          "buffer"  : serialized buffer containing the fn, args and kwargs
       }

    If ``None`` is received, the runner will exit.

    Response messages should be of the form:

    .. code:: python

       {
          "task_id" : <uuid.uuid4 string>,
          "result"  : serialized buffer containing result
          "exception" : serialized exception object
       }

    On exiting, the runner will post ``None`` to the outgoing_q.

    """
    logger.debug("[RUNNER] Starting")

    def execute_task(bufs):
        """Deserialize the buffer and execute the task.

        Returns the serialized result or exception.
        """
        user_ns = locals()
        user_ns.update({'__builtins__': __builtins__})

        f, args, kwargs = unpack_apply_message(bufs, user_ns, copy=False)

        fname = getattr(f, '__name__', 'f')
        prefix = "parsl_"
        fname = prefix + "f"
        argname = prefix + "args"
        kwargname = prefix + "kwargs"
        resultname = prefix + "result"

        user_ns.update({
            fname: f,
            argname: args,
            kwargname: kwargs,
            resultname: resultname
        })

        code = "{0} = {1}(*{2}, **{3})".format(resultname, fname, argname,
                                               kwargname)

        try:
            logger.debug("[RUNNER] Executing: {0}".format(code))
            exec(code, user_ns, user_ns)

        except Exception as e:
            logger.warning("Caught exception; will raise it: {}".format(e))
            raise e

        else:
            logger.debug("[RUNNER] Result: {0}".format(
                user_ns.get(resultname)))
            return user_ns.get(resultname)

    while True:
        try:
            # Blocking wait on the queue
            msg = incoming_q.get(block=True, timeout=10)

        except queue.Empty:
            # Handle case where no items were in the queue
            logger.debug("[RUNNER] Queue is empty")

        except IOError as e:
            logger.debug("[RUNNER] Broken pipe: {}".format(e))
            try:
                # Attempt to send a stop notification to the management thread
                outgoing_q.put(None)

            except Exception:
                pass

            break

        except Exception as e:
            logger.debug("[RUNNER] Caught unknown exception: {}".format(e))

        else:
            # Handle received message
            if not msg:
                # Empty message is a die request
                logger.debug("[RUNNER] Received exit request")
                outgoing_q.put(None)
                break
            else:
                # Received a valid message, handle it
                logger.debug("[RUNNER] Got a valid task with ID {}".format(
                    msg["task_id"]))
                try:
                    response_obj = execute_task(msg['buffer'])
                    response = {
                        "task_id": msg["task_id"],
                        "result": serialize(response_obj)
                    }

                    logger.debug("[RUNNER] Returing result: {}".format(
                        deserialize(response["result"])))

                except Exception as e:
                    logger.debug(
                        "[RUNNER] Caught task exception: {}".format(e))
                    response = {
                        "task_id": msg["task_id"],
                        "exception": serialize(e)
                    }

                outgoing_q.put(response)

    logger.debug("[RUNNER] Terminating")
Example #11
def id_for_memo_serialize(obj, output_ref=False):
    return serialize(obj)
Example #12
def worker(worker_id, pool_id, pool_size, task_queue, result_queue, worker_queue, tasks_in_progress, cpu_affinity):
    """

    Put request token into queue
    Get task from task_queue
    Pop request from queue
    Put result into result_queue
    """
    start_file_logger('{}/block-{}/{}/worker_{}.log'.format(args.logdir, args.block_id, pool_id, worker_id),
                      worker_id,
                      name="worker_log",
                      level=logging.DEBUG if args.debug else logging.INFO)

    # Store worker ID as an environment variable
    os.environ['PARSL_WORKER_RANK'] = str(worker_id)
    os.environ['PARSL_WORKER_COUNT'] = str(pool_size)
    os.environ['PARSL_WORKER_POOL_ID'] = str(pool_id)
    os.environ['PARSL_WORKER_BLOCK_ID'] = str(args.block_id)

    # Sync worker with master
    logger.info('Worker {} started'.format(worker_id))
    if args.debug:
        logger.debug("Debug logging enabled")

    # If desired, set process affinity
    if cpu_affinity != "none":
        # Count the number of cores per worker
        avail_cores = sorted(os.sched_getaffinity(0))  # Get the available processors
        cores_per_worker = len(avail_cores) // pool_size
        assert cores_per_worker > 0, "Affinity does not work if there are more workers than cores"

        # Determine this worker's cores
        if cpu_affinity == "block":
            my_cores = avail_cores[cores_per_worker * worker_id:cores_per_worker * (worker_id + 1)]
        elif cpu_affinity == "alternating":
            my_cores = avail_cores[worker_id::pool_size]
        else:
            raise ValueError("Affinity strategy {} is not supported".format(cpu_affinity))

        # Set the affinity for this worker
        os.sched_setaffinity(0, my_cores)
        logger.info("Set worker CPU affinity to {}".format(my_cores))

    while True:
        worker_queue.put(worker_id)

        # The worker will receive {'task_id':<tid>, 'buffer':<buf>}
        req = task_queue.get()
        tasks_in_progress[worker_id] = req
        tid = req['task_id']
        logger.info("Received task {}".format(tid))

        try:
            worker_queue.get()
        except queue.Empty:
            logger.warning("Worker ID: {} failed to remove itself from ready_worker_queue".format(worker_id))
            pass

        try:
            result = execute_task(req['buffer'])
            serialized_result = serialize(result, buffer_threshold=1e6)
        except Exception as e:
            logger.info('Caught an exception: {}'.format(e))
            result_package = {'task_id': tid, 'exception': serialize(RemoteExceptionWrapper(*sys.exc_info()))}
        else:
            result_package = {'task_id': tid, 'result': serialized_result}
            # logger.debug("Result: {}".format(result))

        logger.info("Completed task {}".format(tid))
        try:
            pkl_package = pickle.dumps(result_package)
        except Exception:
            logger.exception("Caught exception while trying to pickle the result package")
            pkl_package = pickle.dumps({'task_id': tid,
                                        'exception': serialize(RemoteExceptionWrapper(*sys.exc_info()))
            })

        result_queue.put(pkl_package)
        tasks_in_progress.pop(worker_id)
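The "block" strategy gives each worker a contiguous slice of the available cores, while "alternating" stripes them across workers. For instance, with 8 available cores and pool_size=4 (hypothetical numbers), worker 1 gets cores [2, 3] under "block" and [1, 5] under "alternating". The slicing arithmetic from the worker above, isolated as a standalone sketch:

# Sketch: the core-selection logic on its own, so it can be checked without
# touching real process affinity.
def pick_cores(avail_cores, worker_id, pool_size, strategy):
    cores_per_worker = len(avail_cores) // pool_size
    if strategy == "block":
        return avail_cores[cores_per_worker * worker_id:
                           cores_per_worker * (worker_id + 1)]
    elif strategy == "alternating":
        return avail_cores[worker_id::pool_size]
    raise ValueError("unsupported affinity strategy: {}".format(strategy))

assert pick_cores(list(range(8)), 1, 4, "block") == [2, 3]
assert pick_cores(list(range(8)), 1, 4, "alternating") == [1, 5]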
Example #13
def worker(worker_id, pool_id, pool_size, task_queue, result_queue,
           worker_queue, tasks_in_progress, cpu_affinity,
           accelerator: Optional[str]):
    """

    Put request token into queue
    Get task from task_queue
    Pop request from queue
    Put result into result_queue
    """

    # override the global logger inherited from the __main__ process (which
    # usually logs to manager.log) with one specific to this worker.
    global logger
    logger = start_file_logger(
        '{}/block-{}/{}/worker_{}.log'.format(args.logdir, args.block_id,
                                              pool_id, worker_id),
        worker_id,
        name="worker_log",
        level=logging.DEBUG if args.debug else logging.INFO)

    # Store worker ID as an environment variable
    os.environ['PARSL_WORKER_RANK'] = str(worker_id)
    os.environ['PARSL_WORKER_COUNT'] = str(pool_size)
    os.environ['PARSL_WORKER_POOL_ID'] = str(pool_id)
    os.environ['PARSL_WORKER_BLOCK_ID'] = str(args.block_id)

    # share the result queue with monitoring code so it too can send results down that channel
    import parsl.executors.high_throughput.monitoring_info as mi
    mi.result_queue = result_queue

    # Sync worker with master
    logger.info('Worker {} started'.format(worker_id))
    if args.debug:
        logger.debug("Debug logging enabled")

    # If desired, set process affinity
    if cpu_affinity != "none":
        # Count the number of cores per worker
        avail_cores = sorted(
            os.sched_getaffinity(0))  # Get the available processors
        cores_per_worker = len(avail_cores) // pool_size
        assert cores_per_worker > 0, "Affinity does not work if there are more workers than cores"

        # Determine this worker's cores
        if cpu_affinity == "block":
            my_cores = avail_cores[cores_per_worker *
                                   worker_id:cores_per_worker *
                                   (worker_id + 1)]
        elif cpu_affinity == "alternating":
            my_cores = avail_cores[worker_id::pool_size]
        else:
            raise ValueError(
                "Affinity strategy {} is not supported".format(cpu_affinity))

        # Set the affinity for this worker
        os.sched_setaffinity(0, my_cores)
        logger.info("Set worker CPU affinity to {}".format(my_cores))

    # If desired, pin to accelerator
    if accelerator is not None:
        os.environ["CUDA_VISIBLE_DEVICES"] = accelerator
        os.environ["ROCR_VISIBLE_DEVICES"] = accelerator
        os.environ["SYCL_DEVICE_FILTER"] = f"*:*:{accelerator}"
        logger.info(f'Pinned worker to accelerator: {accelerator}')

    while True:
        worker_queue.put(worker_id)

        # The worker will receive {'task_id':<tid>, 'buffer':<buf>}
        req = task_queue.get()
        tasks_in_progress[worker_id] = req
        tid = req['task_id']
        logger.info("Received task {}".format(tid))

        try:
            worker_queue.get()
        except queue.Empty:
            logger.warning(
                "Worker ID: {} failed to remove itself from ready_worker_queue"
                .format(worker_id))
            pass

        try:
            result = execute_task(req['buffer'])
            serialized_result = serialize(result, buffer_threshold=1e6)
        except Exception as e:
            logger.info('Caught an exception: {}'.format(e))
            result_package = {
                'type': 'result',
                'task_id': tid,
                'exception': serialize(RemoteExceptionWrapper(*sys.exc_info()))
            }
        else:
            result_package = {
                'type': 'result',
                'task_id': tid,
                'result': serialized_result
            }
            # logger.debug("Result: {}".format(result))

        logger.info("Completed task {}".format(tid))
        try:
            pkl_package = pickle.dumps(result_package)
        except Exception:
            logger.exception(
                "Caught exception while trying to pickle the result package")
            pkl_package = pickle.dumps({
                'type': 'result',
                'task_id': tid,
                'exception': serialize(RemoteExceptionWrapper(*sys.exc_info()))
            })

        result_queue.put(pkl_package)
        tasks_in_progress.pop(worker_id)
        logger.info("All processing finished for task {}".format(tid))
Example #14
def id_for_memo_serialize(obj: object, output_ref: bool = False) -> bytes:
    return serialize(obj)
Example #15
import argparse

if __name__ == '__main__':

    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input", required=True,
                        help="Input pickle file")
    parser.add_argument("-o", "--output", required=True,
                        help="Output pickle file")
    args = parser.parse_args()

    print(f"Input : {args.input}")
    print(f"Output : {args.output}")

    result = None
    with open(args.input, 'rb') as f:
        fn_buf = f.read()
        print("Read input pkl file")
        try:
            result = execute_task(fn_buf)
            print("Finished execution")
        except Exception as e:
            print(f"Execution failed due to {e}")
            result = e
    result_buf = serialize(result)
    with open(args.output, 'wb') as f:
        f.write(result_buf)
Example #16
def worker(worker_id, pool_id, pool_size, task_queue, result_queue,
           worker_queue, tasks_in_progress):
    """

    Put request token into queue
    Get task from task_queue
    Pop request from queue
    Put result into result_queue
    """
    start_file_logger('{}/block-{}/{}/worker_{}.log'.format(
        args.logdir, args.block_id, pool_id, worker_id),
                      worker_id,
                      name="worker_log",
                      level=logging.DEBUG if args.debug else logging.INFO)

    # Store worker ID as an environment variable
    os.environ['PARSL_WORKER_RANK'] = str(worker_id)
    os.environ['PARSL_WORKER_COUNT'] = str(pool_size)
    os.environ['PARSL_WORKER_POOL_ID'] = str(pool_id)

    # Sync worker with master
    logger.info('Worker {} started'.format(worker_id))
    if args.debug:
        logger.debug("Debug logging enabled")

    while True:
        worker_queue.put(worker_id)

        # The worker will receive {'task_id':<tid>, 'buffer':<buf>}
        req = task_queue.get()
        tasks_in_progress[worker_id] = req
        tid = req['task_id']
        logger.info("Received task {}".format(tid))

        try:
            worker_queue.get()
        except queue.Empty:
            logger.warning(
                "Worker ID: {} failed to remove itself from ready_worker_queue"
                .format(worker_id))
            pass

        try:
            result = execute_task(req['buffer'])
            serialized_result = serialize(result, buffer_threshold=1e6)
        except Exception as e:
            logger.info('Caught an exception: {}'.format(e))
            result_package = {
                'task_id': tid,
                'exception': serialize(RemoteExceptionWrapper(*sys.exc_info()))
            }
        else:
            result_package = {'task_id': tid, 'result': serialized_result}
            # logger.debug("Result: {}".format(result))

        logger.info("Completed task {}".format(tid))
        try:
            pkl_package = pickle.dumps(result_package)
        except Exception:
            logger.exception(
                "Caught exception while trying to pickle the result package")
            pkl_package = pickle.dumps({
                'task_id': tid,
                'exception': serialize(RemoteExceptionWrapper(*sys.exc_info()))
            })

        result_queue.put(pkl_package)
        tasks_in_progress.pop(worker_id)