예제 #1
0
 def fetch_data_from_single_data_server(data_server):
     """Stream Arrow record batches from one data server as pandas DataFrames.

     Opens a TCP connection to ``(data_server.host, data_server.port)``,
     decodes the incoming bytes with ``ArrowStreamSerializer.load_stream`` and
     yields one ``pandas.DataFrame`` per item produced by the serializer.

     The read buffer size is taken from the ``BUFFER_SIZE`` environment
     variable (default 65536).

     This is a generator: the socket and the buffered reader stay open while
     the caller iterates, and both are closed when the generator is exhausted,
     closed, or aborted by an exception.
     """
     import pyarrow as pa

     out_ser = ArrowStreamSerializer()
     # Parse the configuration before connecting so a malformed BUFFER_SIZE
     # fails without opening a connection first.
     buffer_size = int(os.environ.get("BUFFER_SIZE", 65536))
     with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
         sock.connect((data_server.host, data_server.port))
         # os.dup() creates a second descriptor that the socket's context
         # manager does not own; wrap the file object in its own `with` so the
         # duplicate is closed deterministically instead of leaking.
         with os.fdopen(os.dup(sock.fileno()), "rb", buffer_size) as infile:
             for items in out_ser.load_stream(infile):
                 yield pa.Table.from_batches([items]).to_pandas()
예제 #2
0
 def inner_fetch():
     """Yield every batch served by each entry of ``data_servers``, in order.

     For each server a TCP connection is opened to
     ``(data_server.host, data_server.port)`` and the byte stream is decoded
     with ``ArrowStreamSerializer.load_stream``; the decoded batches are
     yielded unchanged.

     ``data_servers`` is taken from the enclosing scope. The read buffer size
     comes from the ``BUFFER_SIZE`` environment variable (default 65536).

     This is a generator: the connection to a server stays open while its
     batches are being consumed and is closed before moving to the next one.
     """
     for data_server in data_servers:
         buffer_size = int(os.environ.get("BUFFER_SIZE", 65536))
         with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
             out_ser = ArrowStreamSerializer()
             sock.connect((data_server.host, data_server.port))
             # The descriptor returned by os.dup() is independent of `sock`,
             # so it needs its own context manager; otherwise one descriptor
             # would leak per server.
             with os.fdopen(os.dup(sock.fileno()), "rb",
                            buffer_size) as infile:
                 for batch in out_ser.load_stream(infile):
                     yield batch
예제 #3
0
파일: worker.py 프로젝트: pj1987111/pyjava
def main(infile, outfile):
    """Run one worker session over the ``infile`` / ``outfile`` streams.

    Steps, in the order they touch the streams:

    1. Optionally lower the address-space limit to ``PY_EXECUTOR_MEMORY``
       megabytes (only when that value is positive and the ``resource``
       module is available).
    2. Read the header from ``infile``: split index (int), barrier flag
       (bool), bound port (int), a counted list of UTF-8 key/value
       configuration pairs, and the UTF-8 command (the code to run).
    3. Execute the command with a ``PythonContext`` exposed as
       ``data_manager`` and ``context``, then write
       ``SpecialLengths.START_ARROW_STREAM`` followed by the context's output
       to ``outfile``.
    4. Write ``SpecialLengths.END_OF_DATA_SECTION``, read a flag from
       ``infile`` and echo ``END_OF_STREAM`` if that is what was received;
       otherwise write ``END_OF_DATA_SECTION`` again and exit.

    If any ``Exception`` escapes steps 1-3, the crash markers and the
    formatted traceback are written to ``outfile`` and the process exits with
    status -1. A split index of -1 also exits with status -1.

    :param infile: binary stream the header and input data are read from.
    :param outfile: binary stream the results and status markers are written to.
    """
    try:
        # Optional dependency: imported here if installed, ignored otherwise.
        try:
            import ray
        except ImportError:
            pass
        # set up memory limits
        memory_limit_mb = int(os.environ.get('PY_EXECUTOR_MEMORY', "-1"))
        if memory_limit_mb > 0 and has_resource_module:
            total_memory = resource.RLIMIT_AS
            try:
                (soft_limit, hard_limit) = resource.getrlimit(total_memory)
                msg = "Current mem limits: {0} of max {1}\n".format(
                    soft_limit, hard_limit)
                print(msg, file=sys.stderr)

                # convert to bytes
                new_limit = memory_limit_mb * 1024 * 1024

                # Only ever tighten the limit; never raise an existing one.
                if soft_limit == resource.RLIM_INFINITY or new_limit < soft_limit:
                    msg = "Setting mem limits to {0} of max {1}\n".format(
                        new_limit, new_limit)
                    print(msg, file=sys.stderr)
                    resource.setrlimit(total_memory, (new_limit, new_limit))

            except (resource.error, OSError, ValueError) as e:
                # not all systems support resource limits, so warn instead of failing
                print("WARN: Failed to set memory limit: {0}\n".format(e),
                      file=sys.stderr)
        split_index = read_int(infile)
        print("split_index:%s" % split_index)
        if split_index == -1:  # for unit tests
            sys.exit(-1)

        # These two header fields are not used below, but they still have to
        # be read so that the following reads start at the right position.
        is_barrier = read_bool(infile)
        bound_port = read_int(infile)

        conf = {}
        for _ in range(read_int(infile)):
            k = utf8_deserializer.loads(infile)
            v = utf8_deserializer.loads(infile)
            conf[k] = v

        command = utf8_deserializer.loads(infile)
        ser = ArrowStreamSerializer()

        timezone = conf.get("timezone")

        out_ser = ArrowStreamPandasSerializer(timezone, True, True)
        is_interactive = os.environ.get('PY_INTERACTIVE', "no") == "yes"
        import uuid
        context_id = str(uuid.uuid4())

        # Per-session scratch directory, named after the context id and
        # created relative to the current working directory.
        if not os.path.exists(context_id):
            os.mkdir(context_id)

        def process():
            """Execute the command and stream its output, then clean up."""
            # A `global` statement applies to the whole function body, so the
            # declarations live at the top rather than inside one branch.
            global data_manager
            global context
            global globals_namespace
            # Bound up front so the cleanup below can test it even when the
            # body fails before the output iterator exists.
            out_iter = None
            try:
                input_data = ser.load_stream(infile)
                code = CodeCache.get(command)
                data_manager = PythonContext(context_id, input_data, conf)
                if is_interactive:
                    # Interactive mode runs the code in the shared module-level
                    # namespace.
                    context = data_manager
                    exec(code, globals_namespace, globals_namespace)
                else:
                    # Non-interactive mode runs the code in a fresh namespace
                    # that only exposes the context object.
                    n_local = {
                        "data_manager": data_manager,
                        "context": data_manager
                    }
                    exec(code, n_local, n_local)
                out_iter = data_manager.output()
                write_int(SpecialLengths.START_ARROW_STREAM, outfile)
                out_ser.dump_stream(out_iter, outfile)
            finally:
                # Cleanup is best effort: a failure here must not mask the
                # outcome of the run itself.
                import shutil
                shutil.rmtree(context_id, ignore_errors=True)

                try:
                    if out_iter is not None and hasattr(out_iter, 'close'):
                        out_iter.close()
                except Exception:
                    pass

                try:
                    del data_manager
                except NameError:
                    # PythonContext was never created, nothing to release.
                    pass

        process()

    except Exception:
        try:
            write_int(SpecialLengths.ARROW_STREAM_CRASH, outfile)
            write_int(SpecialLengths.PYTHON_EXCEPTION_THROWN, outfile)
            write_with_length(traceback.format_exc().encode("utf-8"), outfile)
        except IOError:
            # The JVM closed the socket; there is nobody left to report to.
            pass
        except Exception:
            # Write the error to stderr if it happened while serializing
            print("Py worker failed with exception:", file=sys.stderr)
            print(traceback.format_exc(), file=sys.stderr)
        sys.exit(-1)

    write_int(SpecialLengths.END_OF_DATA_SECTION, outfile)
    flag = read_int(infile)
    if flag == SpecialLengths.END_OF_STREAM:
        write_int(SpecialLengths.END_OF_STREAM, outfile)
    else:
        # write a different value to tell JVM to not reuse this worker
        write_int(SpecialLengths.END_OF_DATA_SECTION, outfile)
        sys.exit(-1)