def run(graph_filename=None, start_port=9000):
    """Instantiates and runs the dataflow graph.

    ERDOS will spawn 1 process for each python operator, and connect them via
    TCP.

    Args:
        graph_filename (str): the filename to which to write the dataflow
            graph as a DOT file.
        start_port (int): the port on which to start. The start port is the
            lowest port ERDOS will use to establish TCP connections between
            operators.
    """
    # One data and one control address per operator, plus one for the driver.
    data_addresses = [
        "127.0.0.1:{port}".format(port=start_port + i)
        for i in range(_num_py_operators + 1)
    ]
    control_addresses = [
        "127.0.0.1:{port}".format(port=start_port + len(data_addresses) + i)
        for i in range(_num_py_operators + 1)
    ]

    def runner(node_id, data_addresses, control_addresses):
        _internal.run(node_id, data_addresses, control_addresses)

    # Use the fork start method so the nested `runner` closure is usable as a
    # process target. On macOS, Python 3.8+ defaults to spawn, which requires
    # pickling the target and fails for nested functions.
    # https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods
    ctx = mp.get_context("fork")
    processes = [
        ctx.Process(target=runner, args=(i, data_addresses, control_addresses))
        for i in range(1, _num_py_operators + 1)
    ]

    # Needed to shut down child processes. Install the handler BEFORE starting
    # the children so a SIGINT delivered during startup still terminates them.
    def sigint_handler(sig, frame):
        for p in processes:
            p.terminate()
        sys.exit(0)

    signal.signal(signal.SIGINT, sigint_handler)

    for p in processes:
        p.start()

    # The driver must always be on node 0 otherwise ingest and extract streams
    # will break
    _internal.run_async(0, data_addresses, control_addresses, graph_filename)

    for p in processes:
        p.join()
def run_async(graph_filename: Optional[str] = None,
              start_port: Optional[int] = 9000) -> NodeHandle:
    """Instantiates and runs the dataflow graph asynchronously.

    ERDOS will spawn 1 process for each python operator, and connect them via
    TCP.

    Args:
        graph_filename: The filename to which to write the dataflow graph as a
            DOT file.
        start_port: The port on which to start. The start port is the lowest
            port ERDOS will use to establish TCP connections between operators.

    Returns:
        A :py:class:`.NodeHandle` that allows the driver to interface with the
        dataflow graph.
    """
    # One data address and one control address per operator, plus one extra of
    # each for the driver (node 0).
    num_nodes = _num_py_operators + 1
    data_addresses = [
        "127.0.0.1:{port}".format(port=start_port + offset)
        for offset in range(num_nodes)
    ]
    control_addresses = [
        "127.0.0.1:{port}".format(port=start_port + len(data_addresses) +
                                  offset) for offset in range(num_nodes)
    ]
    logger.debug(
        "Running the dataflow graph on addresses: {}".format(data_addresses))

    # Force the fork start method: on macOS, Python 3.8+ defaults to spawn,
    # which requires picklable process targets.
    # https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods
    # Warning: fork may lead to crashes in some environments.
    # https://bugs.python.org/issue33725
    ctx = mp.get_context("fork")
    processes = []
    for node_id in range(1, num_nodes):
        processes.append(
            ctx.Process(target=_run_node,
                        args=(node_id, data_addresses, control_addresses)))

    # Register the handler before starting the children so they are cleaned up
    # even if SIGINT arrives during startup.
    def sigint_handler(sig, frame):
        for child in processes:
            child.terminate()
        sys.exit(0)

    signal.signal(signal.SIGINT, sigint_handler)

    for child in processes:
        child.start()

    # The driver must always be on node 0 otherwise ingest and extract streams
    # will break
    py_node_handle = _internal.run_async(0, data_addresses, control_addresses,
                                         graph_filename)

    return NodeHandle(py_node_handle, processes)
def run_async(driver, start_port=9000):
    """Instantiates and runs the dataflow graph asynchronously.

    ERDOS will spawn 1 process for each python operator, and connect them via
    TCP.

    Args:
        driver (function): function that builds the dataflow graph. This must
            be passed as a function so it can run on all ERDOS processes.
        start_port (int): the port on which to start. The start port is the
            lowest port ERDOS will use to establish TCP connections between
            operators.
    """
    results = driver()  # run driver to set _num_py_operators
    addresses = [
        "127.0.0.1:{port}".format(port=start_port + i)
        for i in range(_num_py_operators + 1)  # Add 1 for the driver
    ]

    def runner(driver, node_id, addresses):
        driver()
        _internal.run(node_id, addresses)

    # Use the fork start method so the nested `runner` closure is usable as a
    # process target. On macOS, Python 3.8+ defaults to spawn, which requires
    # pickling the target and fails for nested functions.
    # https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods
    ctx = mp.get_context("fork")
    processes = [
        ctx.Process(target=runner, args=(driver, i + 1, addresses))
        # Add 1 b/c driver is node 0
        for i in range(_num_py_operators)
    ]

    # The driver must always be on node 0.
    _internal.run_async(0, addresses)

    # Needed to shut down child processes. Install the handler BEFORE starting
    # the children so a SIGINT delivered during startup still terminates them.
    def sigint_handler(sig, frame):
        for p in processes:
            p.terminate()
        sys.exit(0)

    signal.signal(signal.SIGINT, sigint_handler)

    for p in processes:
        p.start()

    return results
def run_async(graph_filename: Optional[str] = None,
              start_port: Optional[int] = 9000) -> NodeHandle:
    """Instantiates and runs the dataflow graph asynchronously.

    ERDOS will spawn 1 process for each python operator, and connect them via
    TCP.

    Args:
        graph_filename: The filename to which to write the dataflow graph as a
            DOT file.
        start_port: The port on which to start. The start port is the lowest
            port ERDOS will use to establish TCP connections between operators.

    Returns:
        A :py:class:`.NodeHandle` that allows the driver to interface with the
        dataflow graph.
    """
    data_addresses = [
        "127.0.0.1:{port}".format(port=start_port + i)
        for i in range(_num_py_operators + 1)
    ]
    control_addresses = [
        "127.0.0.1:{port}".format(port=start_port + len(data_addresses) + i)
        for i in range(_num_py_operators + 1)
    ]
    logger.debug(
        "Running the dataflow graph on addresses: {}".format(data_addresses))

    def runner(node_id, data_addresses, control_addresses):
        _internal.run(node_id, data_addresses, control_addresses)

    # Use the fork start method so the nested `runner` closure is usable as a
    # process target. On macOS, Python 3.8+ defaults to spawn, which requires
    # pickling the target and fails for nested functions.
    # https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods
    # Warning: fork may lead to crashes in some environments.
    # https://bugs.python.org/issue33725
    ctx = mp.get_context("fork")
    processes = [
        ctx.Process(target=runner, args=(i, data_addresses, control_addresses))
        for i in range(1, _num_py_operators + 1)
    ]

    # Needed to shut down child processes
    def sigint_handler(sig, frame):
        for p in processes:
            p.terminate()
        sys.exit(0)

    signal.signal(signal.SIGINT, sigint_handler)

    for p in processes:
        p.start()

    # The driver must always be on node 0 otherwise ingest and extract streams
    # will break
    py_node_handle = _internal.run_async(0, data_addresses, control_addresses,
                                         graph_filename)

    return NodeHandle(py_node_handle, processes)