Example #1
def read_vineyard_dataframe(vineyard_socket, path, storage_options,
                            read_options, proc_num, proc_index):
    client = vineyard.connect(vineyard_socket)
    params = dict()
    if storage_options:
        raise ValueError("Read vineyard current not support storage options")
    params["header_row"] = "1" if read_options.get("header_row",
                                                   False) else "0"
    params["delimiter"] = bytes(read_options.get("delimiter", ","),
                                "utf-8").decode("unicode_escape")

    stream = DataframeStream.new(client, params)
    client.persist(stream.id)
    report_success(stream.id)

    name = urlparse(path).netloc
    # the "name" part in URL can be a name, or an ObjectID for convenience.
    try:
        df_id = client.get_name(name)
    except Exception:
        df_id = vineyard.ObjectID(name)
    dataframes = client.get(df_id)

    writer: DataframeStream.Writer = stream.open_writer(client)

    try:
        for df in dataframes:
            batch = pa.RecordBatch.from_pandas(df)
            writer.write(batch)
        writer.finish()
    except Exception:
        report_exception()
        writer.fail()
        sys.exit(-1)
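
A minimal, self-contained sketch of the conversion the writer loop above performs per chunk, using only pandas and pyarrow (the sample data is made up for illustration):

import pandas as pd
import pyarrow as pa

# Build a small DataFrame and convert it into an Arrow RecordBatch,
# mirroring what the writer loop does for each dataframe chunk.
df = pd.DataFrame({"id": [1, 2, 3], "value": [0.1, 0.2, 0.3]})
batch = pa.RecordBatch.from_pandas(df)

print(batch.schema.names)  # ['id', 'value']
print(batch.num_rows)      # 3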
Example #2
def read_byte_stream(
    client,
    fs: AbstractFileSystem,
    stream: ByteStream,
    path: str,
    chunk_size: int = CHUNK_SIZE,
):
    logger.info('start reading blob at %s', path)
    with fs.open(path, mode="rb") as f:
        try:
            total_size = f.size()
        except TypeError:
            total_size = f.size

        writer = stream.open_writer(client)
        try:
            begin, end = 0, total_size
            while begin < end:
                buffer = read_block(f, begin, min(chunk_size, end - begin))
                chunk = writer.next(len(buffer))
                vineyard.memory_copy(chunk, 0, buffer)
                begin += len(buffer)
        except Exception:
            report_exception()
            writer.fail()
            sys.exit(-1)

        writer.finish()
        return total_size
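
The chunked copy above is driven by read_block from fsspec.utils, which reads a fixed-size window starting at a byte offset. A standalone sketch of that loop on an in-memory file (the payload is made up for illustration):

import io

from fsspec.utils import read_block

# An in-memory stand-in for the file object returned by fs.open(path, "rb").
f = io.BytesIO(b"0123456789" * 4)  # 40 bytes in total

chunk_size = 16
begin, end = 0, 40
while begin < end:
    # Read the next window of at most chunk_size bytes.
    buffer = read_block(f, begin, min(chunk_size, end - begin))
    print(begin, len(buffer))  # 0 16, 16 16, 32 8
    begin += len(buffer)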
Example #3
def main():
    if len(sys.argv) < 3:
        print("usage: ./serializer <ipc_socket> <object_id>")
        exit(1)
    ipc_socket = sys.argv[1]
    object_id = vineyard.ObjectID(sys.argv[2])
    try:
        serialize(ipc_socket, object_id)
    except Exception:
        report_exception()
        sys.exit(-1)
Example #4
def write_bytes(
    vineyard_socket,
    path,
    stream_id,
    storage_options,
    write_options,
    proc_num,
    proc_index,
):
    """Read bytes from stream and write to external storage.

    Args:
        vineyard_socket (str): IPC socket
        path (str): External storage path to write to
        stream_id (str): ObjectID of the stream to be read from, which is a
                         ParallelStream
        storage_options (dict): Configurations of external storage
        write_options (dict): Additional options that could control the behavior
                              of write
        proc_num (int): Total number of processes
        proc_index (int): The index of this process

    Raises:
        ValueError: If the stream is invalid.
    """
    client = vineyard.connect(vineyard_socket)
    streams = client.get(stream_id)
    if len(streams) != proc_num or streams[proc_index] is None:
        report_error(
            f"Fetch stream error with proc_num = {proc_num}, proc_index = {proc_index}"
        )
        sys.exit(-1)

    instream: ByteStream = streams[proc_index]
    try:
        reader = instream.open_reader(client)
        of = fsspec.open(f"{path}_{proc_index}", "wb", **storage_options)
    except Exception:
        report_exception()
        sys.exit(-1)

    lengths = []  # store the length of each chunk; may be unused
    with of as f:
        while True:
            try:
                chunk = reader.next()
            except (StopIteration, vineyard.StreamDrainedException):
                break
            lengths.append(len(chunk))
            f.write(bytes(chunk))
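
Since write_bytes lets each worker write its own slice to a separate object named "{path}_{proc_index}", the parts can later be stitched back together. A hypothetical sketch with fsspec (the path and process count below are made-up examples):

import fsspec

path = "/tmp/output.csv"  # hypothetical output prefix used by write_bytes
proc_num = 4              # hypothetical number of worker processes

# Concatenate the per-process parts back into a single file.
with fsspec.open(path, "wb") as out:
    for proc_index in range(proc_num):
        with fsspec.open(f"{path}_{proc_index}", "rb") as part:
            out.write(part.read())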
Example #5
def main():
    if len(sys.argv) < 5:
        print(
            "usage: ./deserializer <ipc_socket> <object_id> <proc_num> <proc_index>"
        )
        exit(1)
    ipc_socket = sys.argv[1]
    object_id = vineyard.ObjectID(sys.argv[2])
    proc_num = int(sys.argv[3])
    proc_index = int(sys.argv[4])
    try:
        deserialize(ipc_socket, object_id, proc_num, proc_index)
    except Exception:
        report_exception()
        sys.exit(-1)
Example #6
def write_byte_stream(client, stream: ByteStream, prefix: str,
                      storage_options: Dict):
    path = stream.params[StreamCollection.KEY_OF_PATH]
    try:
        reader = stream.open_reader(client)
        of = fsspec.open(os.path.join(prefix, path), "wb", **storage_options)
    except Exception:
        report_exception()
        sys.exit(-1)

    with of as f:
        while True:
            try:
                chunk = reader.next()
            except (StopIteration, vineyard.StreamDrainedException):
                break
            f.write(bytes(chunk))
Example #7
def parse_dataframe(vineyard_socket, stream_id, proc_num, proc_index):
    client = vineyard.connect(vineyard_socket)
    streams = client.get(stream_id)
    if len(streams) != proc_num or streams[proc_index] is None:
        raise ValueError(
            f"Fetch stream error with proc_num={proc_num},proc_index={proc_index}"
        )
    instream: DataframeStream = streams[proc_index]
    stream_reader = instream.open_reader(client)

    generate_header_row = instream.params.get("header_row", None) == "1"
    delimiter = instream.params.get("delimiter", ",")

    stream = ByteStream.new(client, params=instream.params)
    client.persist(stream.id)
    report_success(stream.id)

    stream_writer = stream.open_writer(client)
    first_write = generate_header_row

    try:
        while True:
            try:
                batch = stream_reader.next()  # pa.RecordBatch
            except (StopIteration, vineyard.StreamDrainedException):
                stream_writer.finish()
                break
            df = batch.to_pandas()
            csv_content = df.to_csv(header=first_write,
                                    index=False,
                                    sep=delimiter).encode('utf-8')

            # write to byte stream
            first_write = False
            chunk = stream_writer.next(len(csv_content))
            vineyard.memory_copy(chunk, 0, csv_content)
    except Exception:
        report_exception()
        stream_writer.fail()
        sys.exit(-1)
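
The first_write flag above ensures the header row is emitted only for the first record batch, so the concatenated chunks form a single well-formed CSV document. The same idea in plain pandas (the sample data is made up for illustration):

import pandas as pd

# Two chunks of the same logical table.
chunks = [
    pd.DataFrame({"a": [1, 2], "b": ["x", "y"]}),
    pd.DataFrame({"a": [3, 4], "b": ["z", "w"]}),
]

first_write = True
pieces = []
for df in chunks:
    # Emit the header only for the first chunk, as parse_dataframe does.
    pieces.append(df.to_csv(header=first_write, index=False, sep=","))
    first_write = False

print("".join(pieces))  # one CSV document with a single header row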
Example #8
def read_bytes(  # noqa: C901
    vineyard_socket: str,
    path: str,
    storage_options: Dict,
    read_options: Dict,
    proc_num: int,
    proc_index: int,
):
    """Read bytes from external storage and produce a ByteStream,
    which will later be assembled into a ParallelStream.

    Args:
        vineyard_socket (str): IPC socket
        path (str): External storage path to read from
        storage_options (dict): Configurations of external storage
        read_options (dict): Additional options that could control the behavior of read
        proc_num (int): Total number of processes
        proc_index (int): The index of this process

    Raises:
        ValueError: If the stream is invalid.
    """
    client = vineyard.connect(vineyard_socket)
    params = dict()

    read_block_delimiter = read_options.pop('read_block_delimiter', '\n')
    if read_block_delimiter is not None:
        read_block_delimiter = read_block_delimiter.encode('utf-8')

    # Used when reading tables from external storage,
    # usually for loading a property graph.
    header_row = read_options.get("header_row", False)
    for k, v in read_options.items():
        if k in ("header_row", "include_all_columns"):
            params[k] = "1" if v else "0"
        elif k == "delimiter":
            params[k] = bytes(v, "utf-8").decode("unicode_escape")
        else:
            params[k] = v

    try:
        protocol = split_protocol(path)[0]
        fs = fsspec.filesystem(protocol, **storage_options)
    except Exception:
        report_error(
            f"Cannot initialize such filesystem for '{path}', "
            f"exception is:\n{traceback.format_exc()}"
        )
        sys.exit(-1)

    if fs.isfile(path):
        files = [path]
    else:
        try:
            files = fs.glob(path + '*')
            assert files, f"Cannot find such files: {path}"
        except Exception:
            report_error(f"Cannot find such files for '{path}'")
            sys.exit(-1)
    ''' Note [Semantic of read_block with delimiter]:

    read_block(fp, begin, size, delimiter) will:

        - find the first `delimiter` from `begin`, then start reading
        - after `size` bytes, continue until the next `delimiter` or EOF, then finish reading.
            Note that the returned size may exceed `size`.
    '''

    stream, writer = None, None
    if 'chunk_size' in storage_options:
        chunk_size = parse_readable_size(storage_options['chunk_size'])
    else:
        chunk_size = 1024 * 1024 * 64  # default: 64MB

    try:
        for index, file_path in enumerate(files):
            with fs.open(file_path, mode="rb") as f:
                offset = 0
                # Only process the header line, and open the writer,
                # when processing the first file.
                if index == 0:
                    if header_row:
                        header_line = read_block(f, 0, 1, read_block_delimiter)
                        params["header_line"] = header_line.decode("unicode_escape")
                        offset = len(header_line)
                    stream = ByteStream.new(client, params)
                    client.persist(stream.id)
                    report_success(stream.id)
                    writer = stream.open_writer(client)

                try:
                    total_size = f.size()
                except TypeError:
                    total_size = f.size
                part_size = (total_size - offset) // proc_num
                begin = part_size * proc_index + offset
                end = min(begin + part_size, total_size)

                # See Note [Semantic of read_block with delimiter].
                if index == 0 and proc_index == 0:
                    begin -= int(header_row)

                while begin < end:
                    buffer = read_block(
                        f,
                        begin,
                        min(chunk_size, end - begin),
                        delimiter=read_block_delimiter,
                    )
                    size = len(buffer)
                    if size <= 0:
                        break
                    begin += size - 1
                    chunk = writer.next(size)
                    vineyard.memory_copy(chunk, 0, buffer)
        writer.finish()
    except Exception:
        report_exception()
        if writer is not None:
            writer.fail()
        sys.exit(-1)
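
The behavior described in Note [Semantic of read_block with delimiter] can be observed directly with fsspec.utils.read_block on an in-memory file (the records below are made up for illustration):

import io

from fsspec.utils import read_block

data = b"alpha\nbeta\ngamma\ndelta\n"
f = io.BytesIO(data)

# With a delimiter, read_block starts after the first delimiter found at or
# past `offset` (except at offset 0) and keeps reading past `length` until
# the next delimiter, so no record is ever split across two blocks.
first = read_block(f, 0, 10, delimiter=b"\n")
second = read_block(f, 10, 10, delimiter=b"\n")

print(first)   # b'alpha\nbeta\n'   -- 11 bytes, more than the 10 requested
print(second)  # b'gamma\ndelta\n'  -- picks up where the first block stopped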
Example #9
def parse_bytes(vineyard_socket, stream_id, proc_num,
                proc_index):  # noqa: C901
    client = vineyard.connect(vineyard_socket)

    # get input streams
    streams = client.get(stream_id)
    if len(streams) != proc_num or streams[proc_index] is None:
        raise ValueError(
            f"Fetch stream error with proc_num={proc_num},proc_index={proc_index}"
        )
    instream: ByteStream = streams[proc_index]
    stream_reader = instream.open_reader(client)

    use_header_row = instream.params.get("header_row", None) == "1"
    delimiter = instream.params.get("delimiter", ",")

    # process parsing and converting options

    columns = []
    column_types = []
    original_columns = []
    header_line = None

    if use_header_row:
        header_line: str = instream.params.get('header_line', None)
        if not header_line:
            report_error(
                'Header line not found while header_row is set to True')
            sys.exit(-1)
        original_columns = header_line.strip().split(delimiter)

    schema = instream.params.get('schema', None)
    if schema:
        columns = schema.split(',')

    column_types = instream.params.get('column_types', [])
    if column_types:
        column_types = column_types.split(',')

    include_all_columns = instream.params.get('include_all_columns',
                                              None) == '1'

    read_options = pa.csv.ReadOptions()
    parse_options = pa.csv.ParseOptions()
    convert_options = pa.csv.ConvertOptions()

    if original_columns:
        read_options.column_names = original_columns
    else:
        read_options.autogenerate_column_names = True
    parse_options.delimiter = delimiter

    indices = []
    for i, column in enumerate(columns):
        if original_columns:
            if column.isdigit():
                column_index = int(column)
                if column_index >= len(original_columns):
                    raise IndexError('Column index out of range: %s of %s' %
                                     (column_index, original_columns))
                indices.append(i)
                columns[i] = original_columns[column_index]
        else:
            # arrow auto-generates column names in this way.
            columns[i] = 'f%s' % i

    if include_all_columns:
        for column in original_columns:
            if column not in columns:
                columns.append(column)

    if columns:
        convert_options.include_columns = columns
    if len(column_types) > len(columns):
        raise ValueError(
            "Format of column type schema is incorrect: too many columns")

    arrow_column_types = dict()
    for i, column_type in enumerate(column_types):
        if column_type:
            arrow_column_types[columns[i]] = normalize_arrow_dtype(column_type)
    convert_options.column_types = arrow_column_types

    stream = DataframeStream.new(client, params=instream.params)
    client.persist(stream.id)
    report_success(stream.id)

    stream_writer = stream.open_writer(client)

    try:
        while True:
            try:
                content = stream_reader.next()
            except (StopIteration, vineyard.StreamDrainedException):
                stream_writer.finish()
                break

            # parse csv
            table = parse_dataframe_blocks(content, read_options,
                                           parse_options, convert_options)
            # write recordbatches
            stream_writer.write_table(table)
    except Exception:
        report_exception()
        stream_writer.fail()
        sys.exit(-1)
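
Example #9 builds its pyarrow CSV options up front and hands them to parse_dataframe_blocks for every chunk. Independent of that helper, the same option objects drive pyarrow.csv.read_csv directly; a small sketch on in-memory bytes (the column names, delimiter and data are made up for illustration):

import io

import pyarrow as pa
import pyarrow.csv

# An in-memory CSV chunk without a header row, as carried by the byte stream.
content = b"1|x\n2|y\n3|z\n"

read_options = pa.csv.ReadOptions(column_names=["id", "label"])
parse_options = pa.csv.ParseOptions(delimiter="|")
convert_options = pa.csv.ConvertOptions(column_types={"id": pa.int64()})

table = pa.csv.read_csv(
    io.BytesIO(content),
    read_options=read_options,
    parse_options=parse_options,
    convert_options=convert_options,
)
print(table.schema)  # id: int64, label: string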