def fetch_data_from_single_data_server(data_server):
    """Stream Arrow record batches from one data server as pandas DataFrames.

    Connects to ``(data_server.host, data_server.port)``, reads an Arrow
    IPC stream via ``ArrowStreamSerializer`` and lazily yields each record
    batch converted to a ``pandas.DataFrame``.

    :param data_server: object exposing ``host`` and ``port`` attributes
        (project-declared type; exact contract defined elsewhere).
    :return: generator of ``pandas.DataFrame``, one per Arrow record batch.

    Fix over the original: the buffered reader built from the dup'd socket
    fd was never closed, leaking a file descriptor per call. It is now
    managed by a ``with`` block so it is closed even if the consumer
    abandons the generator.
    """
    import pyarrow as pa

    out_ser = ArrowStreamSerializer()
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
        sock.connect((data_server.host, data_server.port))
        # Read buffer size is tunable via the environment; default 64 KiB.
        buffer_size = int(os.environ.get("BUFFER_SIZE", 65536))
        # dup() so the buffered file object owns its own fd, independent of
        # the socket's; the with-block guarantees it is closed (the original
        # leaked it).
        with os.fdopen(os.dup(sock.fileno()), "rb", buffer_size) as infile:
            for items in out_ser.load_stream(infile):
                yield pa.Table.from_batches([items]).to_pandas()
def inner_fetch():
    """Yield raw Arrow record batches from every server in ``data_servers``.

    Iterates the (externally defined) ``data_servers`` sequence in order,
    opening one TCP connection per server and yielding each record batch
    produced by ``ArrowStreamSerializer.load_stream`` unchanged.

    :return: generator of Arrow record batches, in server order.

    Fix over the original: the buffered reader built from the dup'd socket
    fd was never closed, leaking one file descriptor per server. It is now
    closed deterministically by a ``with`` block.
    """
    for data_server in data_servers:
        out_ser = ArrowStreamSerializer()
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
            sock.connect((data_server.host, data_server.port))
            # Read buffer size is tunable via the environment; default 64 KiB.
            buffer_size = int(os.environ.get("BUFFER_SIZE", 65536))
            # dup() gives the file object its own fd; close it via with
            # (the original never closed infile).
            with os.fdopen(os.dup(sock.fileno()), "rb", buffer_size) as infile:
                for batch in out_ser.load_stream(infile):
                    yield batch
def main(infile, outfile):
    """Run one unit of work for the JVM-driven Python worker protocol.

    Reads a framed request from ``infile`` (split index, barrier flag,
    bound port, a string->string conf map, and a serialized command),
    executes the user code against a ``PythonContext``, and streams the
    Arrow-serialized output to ``outfile``. Errors are reported back to
    the peer via ``SpecialLengths`` framing. The exact order of reads and
    writes is the wire protocol shared with the JVM side — do not reorder.

    :param infile: binary readable stream from the JVM peer.
    :param outfile: binary writable stream back to the JVM peer.
    """
    try:
        # Best-effort import; presence of ray is optional for this worker.
        try:
            import ray
        except ImportError:
            pass
        # set up memory limits
        # PY_EXECUTOR_MEMORY is in MB; <= 0 (default -1) disables limiting.
        memory_limit_mb = int(os.environ.get('PY_EXECUTOR_MEMORY', "-1"))
        if memory_limit_mb > 0 and has_resource_module:
            total_memory = resource.RLIMIT_AS
            try:
                (soft_limit, hard_limit) = resource.getrlimit(total_memory)
                msg = "Current mem limits: {0} of max {1}\n".format(
                    soft_limit, hard_limit)
                print(msg, file=sys.stderr)
                # convert to bytes
                new_limit = memory_limit_mb * 1024 * 1024
                # Only lower the limit, never raise it above the current soft limit.
                if soft_limit == resource.RLIM_INFINITY or new_limit < soft_limit:
                    msg = "Setting mem limits to {0} of max {1}\n".format(
                        new_limit, new_limit)
                    print(msg, file=sys.stderr)
                    resource.setrlimit(total_memory, (new_limit, new_limit))
            except (resource.error, OSError, ValueError) as e:
                # not all systems support resource limits, so warn instead of failing
                print("WARN: Failed to set memory limit: {0}\n".format(e),
                      file=sys.stderr)
        split_index = read_int(infile)
        print("split_index:%s" % split_index)
        if split_index == -1:  # for unit tests
            sys.exit(-1)
        # Protocol header: barrier flag and the port bound on the peer side.
        # NOTE(review): is_barrier and bound_port are read to keep the stream
        # in sync but are not used below — presumably consumed elsewhere.
        is_barrier = read_bool(infile)
        bound_port = read_int(infile)
        # Conf map: count followed by that many UTF-8 key/value pairs.
        conf = {}
        for i in range(read_int(infile)):
            k = utf8_deserializer.loads(infile)
            v = utf8_deserializer.loads(infile)
            conf[k] = v
        # The user command (code to execute) arrives as a UTF-8 string.
        command = utf8_deserializer.loads(infile)
        ser = ArrowStreamSerializer()
        timezone = conf["timezone"] if "timezone" in conf else None
        out_ser = ArrowStreamPandasSerializer(timezone, True, True)
        is_interactive = os.environ.get('PY_INTERACTIVE', "no") == "yes"
        import uuid
        # Per-invocation scratch directory, named by a fresh UUID and
        # removed in process()'s finally block.
        context_id = str(uuid.uuid4())
        if not os.path.exists(context_id):
            os.mkdir(context_id)

        def process():
            # Execute the user command and stream its Arrow output back.
            try:
                input_data = ser.load_stream(infile)
                code = CodeCache.get(command)
                if is_interactive:
                    # Interactive mode: expose data_manager/context at module
                    # globals so state survives across invocations, and run
                    # the user code in the shared globals_namespace.
                    global data_manager
                    global context
                    data_manager = PythonContext(context_id, input_data, conf)
                    context = data_manager
                    global globals_namespace
                    exec(code, globals_namespace, globals_namespace)
                else:
                    # Batch mode: run the user code in a throwaway namespace
                    # that aliases both names to the same context object.
                    data_manager = PythonContext(context_id, input_data, conf)
                    n_local = {
                        "data_manager": data_manager,
                        "context": data_manager
                    }
                    exec(code, n_local, n_local)
                out_iter = data_manager.output()
                write_int(SpecialLengths.START_ARROW_STREAM, outfile)
                out_ser.dump_stream(out_iter, outfile)
            finally:
                # Best-effort cleanup: each step is independently swallowed so
                # one failure (e.g. out_iter unbound after an exec error)
                # cannot mask the original exception or stop later steps.
                try:
                    import shutil
                    shutil.rmtree(context_id)
                except:
                    pass
                try:
                    if hasattr(out_iter, 'close'):
                        out_iter.close()
                except:
                    pass
                try:
                    del data_manager
                except:
                    pass

        process()
    except Exception:
        # Report the failure over the wire; framing first, then traceback.
        try:
            write_int(SpecialLengths.ARROW_STREAM_CRASH, outfile)
            write_int(SpecialLengths.PYTHON_EXCEPTION_THROWN, outfile)
            write_with_length(traceback.format_exc().encode("utf-8"), outfile)
        except IOError:
            # JVM close the socket
            pass
        except Exception:
            # Write the error to stderr if it happened while serializing
            print("Py worker failed with exception:", file=sys.stderr)
            print(traceback.format_exc(), file=sys.stderr)
            sys.exit(-1)
    # End-of-data marker, then the reuse handshake with the peer.
    write_int(SpecialLengths.END_OF_DATA_SECTION, outfile)
    flag = read_int(infile)
    if flag == SpecialLengths.END_OF_STREAM:
        write_int(SpecialLengths.END_OF_STREAM, outfile)
    else:
        # write a different value to tell JVM to not reuse this worker
        write_int(SpecialLengths.END_OF_DATA_SECTION, outfile)
        sys.exit(-1)