Example #1
def serialize(vineyard_socket, object_id):
    '''Serialize a vineyard object as a stream.

    The serialization executes in the following steps:

    1. glob all blobs in the meta

    2. build a stream for each blob

    3. generate a hierarchical `StreamCollection` object as the result
    '''
    client = vineyard.connect(vineyard_socket)
    meta = client.get_meta(object_id)

    queue: "ConcurrentQueue[Tuple[ByteStream, memoryview]]" = ConcurrentQueue()
    serialized_id = traverse_to_serialize(client, meta, queue, '')

    # the resulting stream collection is ready: persist it and report its id
    client.persist(serialized_id)
    report_success(serialized_id)

    # start the transfer
    #
    # this could easily be parallelized with more executor threads in the future
    executor = ThreadStreamExecutor(SerializeExecutor,
                                    parallism=1,
                                    task_queue=queue)
    results = executor.execute()
    logger.info('finish serialization: %s', results)
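
A minimal usage sketch for the driver above, assuming a local vineyardd listening on '/tmp/vineyard.sock' and that the default builders for pandas objects are available (both the socket path and the DataFrame are placeholders, not part of the original example):

import pandas as pd
import vineyard

socket = '/tmp/vineyard.sock'   # hypothetical IPC socket path
client = vineyard.connect(socket)

# put an object into vineyard, then serialize it as a stream collection
object_id = client.put(pd.DataFrame({'a': [1, 2, 3], 'b': [4.0, 5.0, 6.0]}))
serialize(socket, object_id)
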
Example #2
def deserialize(vineyard_socket, object_id, proc_num, proc_index):
    client = vineyard.connect(vineyard_socket)
    streams = client.get(object_id)

    if len(streams) != proc_num:
        report_error("Expected: %s stream partitions" % proc_num)
        sys.exit(-1)

    queue: "ConcurrentQueue[Tuple[ByteStream, Union[BlobBuilder, Blob]]]" = (
        ConcurrentQueue())
    traverse_to_prepare(client, streams[proc_index].id, queue)

    # serves as a stream id -> (member path, blob) mapping
    rqueue: "ConcurrentQueue[Tuple[ObjectID, str, Blob]]" = ConcurrentQueue()

    # copy blobs
    executor = ThreadStreamExecutor(
        ReconstructExecututor,
        parallism=1,
        client=client,
        task_queue=queue,
        result_queue=rqueue,
    )
    executor.execute()

    blobs: "Dict[ObjectID, Tuple[str, Blob]]" = dict()
    while not rqueue.empty():
        bs, memberpath, blob = rqueue.get(block=False)
        blobs[bs] = (memberpath, blob)

    _, result = traverse_to_rebuild(client, streams[proc_index].id, blobs)
    client.persist(result.id)
    report_success(result.id)
Example #3
def read_vineyard_dataframe(vineyard_socket, path, storage_options,
                            read_options, proc_num, proc_index):
    client = vineyard.connect(vineyard_socket)
    params = dict()
    if storage_options:
        raise ValueError("Read vineyard current not support storage options")
    params["header_row"] = "1" if read_options.get("header_row",
                                                   False) else "0"
    params["delimiter"] = bytes(read_options.get("delimiter", ","),
                                "utf-8").decode("unicode_escape")

    stream = DataframeStream.new(client, params)
    client.persist(stream.id)
    report_success(stream.id)

    name = urlparse(path).netloc
    # the "name" part in URL can be a name, or an ObjectID for convenience.
    try:
        df_id = client.get_name(name)
    except Exception:
        df_id = vineyard.ObjectID(name)
    dataframes = client.get(df_id)

    writer: DataframeStream.Writer = stream.open_writer(client)

    try:
        for df in dataframes:
            batch = pa.RecordBatch.from_pandas(df)
            writer.write(batch)
        writer.finish()
    except Exception:
        report_exception()
        writer.fail()
        sys.exit(-1)
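
The comment above notes that the "name" part of the URL can be either a registered name or an ObjectID. A small illustration of how that part is extracted, assuming a 'vineyard://' style path (the scheme and the names here are made-up placeholders):

from urllib.parse import urlparse

# a user-registered name, resolved via client.get_name(...)
print(urlparse('vineyard://my_dataframe').netloc)       # my_dataframe

# or a literal object id, parsed via vineyard.ObjectID(...)
print(urlparse('vineyard://o000d1b62e51bbc61').netloc)  # o000d1b62e51bbc61
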
Example #4
def read_bytes_collection(
    vineyard_socket, prefix, storage_options, proc_num, proc_index
):
    """Read a set of files as a collection of ByteStreams."""
    client = vineyard.connect(vineyard_socket)

    protocol, prefix_path = split_protocol(prefix)
    fs = fsspec.filesystem(protocol, **storage_options)

    worker_prefix = os.path.join(prefix_path, '%s-%s' % (proc_num, proc_index))

    logger.info("start creating blobs ...")
    queue: "ConcurrentQueue[Tuple[ByteStream, str]]" = ConcurrentQueue()
    stream_id = read_stream_collections(client, fs, queue, worker_prefix, worker_prefix)

    client.persist(stream_id)
    report_success(stream_id)

    logger.info("start reading blobs ...")
    executor = ThreadStreamExecutor(
        ReadToByteStreamExecutor,
        parallism=1,
        client=client,
        fs=fs,
        task_queue=queue,
        chunk_size=CHUNK_SIZE,
    )
    executor.execute()
Example #5
def read_orc(
    vineyard_socket,
    path,
    storage_options: Dict,
    read_options: Dict,
    proc_num,
    proc_index,
):
    # This method reads the data files of a specific Hive table that is
    # stored in ORC format on HDFS.
    #
    # In general, the data files of a Hive table live in the Hive warehouse
    # space on HDFS, in a directory named after the table, e.g.,
    #
    # .. code:: python
    #
    #    '/user/hive/warehouse/sometable'
    #
    # To read the entire table, simply use 'hive://user/hive/warehouse/sometable'
    # as the path.
    #
    # If the table is partitioned, use the sub-directory of a specific partition
    # to read only the data from that partition. For example, if sometable is
    # partitioned by the column date, the data for a given date can be read by
    # passing the path
    #
    # .. code:: python
    #
    #    'hive://user/hive/warehouse/sometable/date=20201112'
    #
    if proc_index:
        raise ValueError("Parallel reading ORC hasn't been supported yet")
    if read_options:
        raise ValueError("Reading ORC doesn't support read options.")
    client = vineyard.connect(vineyard_socket)
    stream = DataframeStream.new(client)
    client.persist(stream.id)
    report_success(stream.id)

    writer = stream.open_writer(client)
    parsed = urlparse(path)

    fs = fsspec.filesystem(parsed.scheme, **storage_options)
    if fs.isfile(parsed.path):
        files = [parsed.path]
    else:
        files = [f for f in fs.ls(parsed.path, detail=False) if fs.isfile(f)]
    for file_path in files:
        read_single_orc(file_path, fs, writer)
    # hdfs = HDFileSystem(
    #     host=host, port=int(port), pars={"dfs.client.read.shortcircuit": "false"}
    # )

    writer.finish()
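
The `read_single_orc` helper is not shown above; the following is a minimal sketch of what it might look like, assuming `pyarrow.orc` is available and that the writer accepts record batches as in Example #3. The implementation is an assumption for illustration, not the original helper:

import pyarrow.orc


def read_single_orc(file_path, fs, writer):
    # stream a single ORC file into the dataframe stream, stripe by stripe
    with fs.open(file_path, mode="rb") as f:
        orc_file = pyarrow.orc.ORCFile(f)
        for i in range(orc_file.nstripes):
            batch = orc_file.read_stripe(i)  # a pyarrow.RecordBatch
            writer.write(batch)
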
Example #6
def parse_dataframe(vineyard_socket, stream_id, proc_num, proc_index):
    client = vineyard.connect(vineyard_socket)
    streams = client.get(stream_id)
    if len(streams) != proc_num or streams[proc_index] is None:
        raise ValueError(
            f"Fetch stream error with proc_num={proc_num},proc_index={proc_index}"
        )
    instream: DataframeStream = streams[proc_index]
    stream_reader = instream.open_reader(client)

    generate_header_row = instream.params.get("header_row", None) == "1"
    delimiter = instream.params.get("delimiter", ",")

    stream = ByteStream.new(client, params=instream.params)
    client.persist(stream.id)
    report_success(stream.id)

    stream_writer = stream.open_writer(client)
    first_write = generate_header_row

    try:
        while True:
            try:
                batch = stream_reader.next()  # pa.RecordBatch
            except (StopIteration, vineyard.StreamDrainedException):
                stream_writer.finish()
                break
            df = batch.to_pandas()
            csv_content = df.to_csv(header=first_write,
                                    index=False,
                                    sep=delimiter).encode('utf-8')

            # write to byte stream
            first_write = False
            chunk = stream_writer.next(len(csv_content))
            vineyard.memory_copy(chunk, 0, csv_content)
    except Exception:
        report_exception()
        stream_writer.fail()
        sys.exit(-1)
Example #7
def write_vineyard_dataframe(vineyard_socket, stream_id, proc_num, proc_index):
    client = vineyard.connect(vineyard_socket)
    streams = client.get(stream_id)
    if len(streams) != proc_num or streams[proc_index] is None:
        raise ValueError(
            f"Fetch stream error with proc_num={proc_num},proc_index={proc_index}"
        )
    instream: DataframeStream = streams[proc_index]
    stream_reader = instream.open_reader(client)

    batch_index = 0
    while True:
        try:
            batch = stream_reader.next()
        except Exception:
            # treat a drained stream (or any read failure) as end-of-input
            break
        df = batch.to_pandas()
        df_id = client.put(df,
                           partition_index=[proc_index, 0],
                           row_batch_index=batch_index)
        batch_index += 1
        client.persist(df_id)
        report_success(df_id)
Example #8
def read_bytes(  # noqa: C901
    vineyard_socket: str,
    path: str,
    storage_options: Dict,
    read_options: Dict,
    proc_num: int,
    proc_index: int,
):
    """Read bytes from external storage and produce a ByteStream,
    which will later be assembled into a ParallelStream.

    Args:
        vineyard_socket (str): IPC socket path.
        path (str): External storage path to read from.
        storage_options (dict): Configuration of the external storage.
        read_options (dict): Additional options that control the read behavior.
        proc_num (int): Total number of processes.
        proc_index (int): The index of this process.

    Raises:
        ValueError: If the stream is invalid.
    """
    client = vineyard.connect(vineyard_socket)
    params = dict()

    read_block_delimiter = read_options.pop('read_block_delimiter', '\n')
    if read_block_delimiter is not None:
        read_block_delimiter = read_block_delimiter.encode('utf-8')

    # Used when reading tables from external storage,
    # usually for loading a property graph.
    header_row = read_options.get("header_row", False)
    for k, v in read_options.items():
        if k in ("header_row", "include_all_columns"):
            params[k] = "1" if v else "0"
        elif k == "delimiter":
            params[k] = bytes(v, "utf-8").decode("unicode_escape")
        else:
            params[k] = v

    try:
        protocol = split_protocol(path)[0]
        fs = fsspec.filesystem(protocol, **storage_options)
    except Exception:
        report_error(
            f"Cannot initialize such filesystem for '{path}', "
            f"exception is:\n{traceback.format_exc()}"
        )
        sys.exit(-1)

    if fs.isfile(path):
        files = [path]
    else:
        try:
            files = fs.glob(path + '*')
            assert files, f"Cannot find such files: {path}"
        except Exception:
            report_error(f"Cannot find such files for '{path}'")
            sys.exit(-1)
    ''' Note [Semantics of read_block with delimiter]:

    read_block(fp, begin, size, delimiter) will:

        - find the first `delimiter` from `begin`, then start reading
        - after `size`, continue until the next `delimiter` or EOF, then finish reading.
            Note that the returned size may exceed `size`.

    A short snippet illustrating this behaviour follows this example.
    '''

    stream, writer = None, None
    if 'chunk_size' in storage_options:
        chunk_size = parse_readable_size(storage_options['chunk_size'])
    else:
        chunk_size = 1024 * 1024 * 64  # default: 64MB

    try:
        for index, file_path in enumerate(files):
            with fs.open(file_path, mode="rb") as f:
                offset = 0
                # Only process the header line and open the writer
                # when processing the first file
                if index == 0:
                    if header_row:
                        header_line = read_block(f, 0, 1, read_block_delimiter)
                        params["header_line"] = header_line.decode("unicode_escape")
                        offset = len(header_line)
                    stream = ByteStream.new(client, params)
                    client.persist(stream.id)
                    report_success(stream.id)
                    writer = stream.open_writer(client)

                try:
                    total_size = f.size()
                except TypeError:
                    total_size = f.size
                part_size = (total_size - offset) // proc_num
                begin = part_size * proc_index + offset
                end = min(begin + part_size, total_size)

                # See Note [Semantic of read_block with delimiter].
                if index == 0 and proc_index == 0:
                    begin -= int(header_row)

                while begin < end:
                    buffer = read_block(
                        f,
                        begin,
                        min(chunk_size, end - begin),
                        delimiter=read_block_delimiter,
                    )
                    size = len(buffer)
                    if size <= 0:
                        break
                    begin += size - 1
                    chunk = writer.next(size)
                    vineyard.memory_copy(chunk, 0, buffer)
        writer.finish()
    except Exception:
        report_exception()
        if writer is not None:
            writer.fail()
        sys.exit(-1)
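
A quick way to check the behaviour described in Note [Semantics of read_block with delimiter] above, using an in-memory file. This only illustrates `fsspec.utils.read_block` and is not part of the driver:

import io

from fsspec.utils import read_block

data = io.BytesIO(b"a,b\nc,d\ne,f\n")

# ask for 5 bytes from offset 0: the block is extended to the next
# delimiter, so the returned size exceeds the requested size
print(read_block(data, 0, 5, delimiter=b"\n"))  # b'a,b\nc,d\n'

# an offset inside a record: reading starts just after the first
# delimiter found from that offset
print(read_block(data, 1, 5, delimiter=b"\n"))  # b'c,d\n'
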
Example #9
def parse_bytes(vineyard_socket, stream_id, proc_num,
                proc_index):  # noqa: C901
    client = vineyard.connect(vineyard_socket)

    # get input streams
    streams = client.get(stream_id)
    if len(streams) != proc_num or streams[proc_index] is None:
        raise ValueError(
            f"Fetch stream error with proc_num={proc_num},proc_index={proc_index}"
        )
    instream: ByteStream = streams[proc_index]
    stream_reader = instream.open_reader(client)

    use_header_row = instream.params.get("header_row", None) == "1"
    delimiter = instream.params.get("delimiter", ",")

    # process parsing and converting options

    columns = []
    column_types = []
    original_columns = []
    header_line = None

    if use_header_row:
        header_line: str = instream.params.get('header_line', None)
        if not header_line:
            report_error(
                'Header line not found while header_row is set to True')
            sys.exit(-1)
        original_columns = header_line.strip().split(delimiter)

    schema = instream.params.get('schema', None)
    if schema:
        columns = schema.split(',')

    column_types = instream.params.get('column_types', [])
    if column_types:
        column_types = column_types.split(',')

    include_all_columns = instream.params.get('include_all_columns',
                                              None) == '1'

    read_options = pa.csv.ReadOptions()
    parse_options = pa.csv.ParseOptions()
    convert_options = pa.csv.ConvertOptions()

    if original_columns:
        read_options.column_names = original_columns
    else:
        read_options.autogenerate_column_names = True
    parse_options.delimiter = delimiter

    indices = []
    for i, column in enumerate(columns):
        if original_columns:
            if column.isdigit():
                column_index = int(column)
                if column_index >= len(original_columns):
                    raise IndexError('Column index out of range: %s of %s' %
                                     (column_index, original_columns))
                indices.append(i)
                columns[i] = original_columns[column_index]
        else:
            columns[i] = 'f%s' % i  # arrow auto-generates column names in this way

    if include_all_columns:
        for column in original_columns:
            if column not in columns:
                columns.append(column)

    if columns:
        convert_options.include_columns = columns
    if len(column_types) > len(columns):
        raise ValueError(
            "Format of column type schema is incorrect: too many columns")

    arrow_column_types = dict()
    for i, column_type in enumerate(column_types):
        if column_type:
            arrow_column_types[columns[i]] = normalize_arrow_dtype(column_type)
    convert_options.column_types = arrow_column_types

    stream = DataframeStream.new(client, params=instream.params)
    client.persist(stream.id)
    report_success(stream.id)

    stream_writer = stream.open_writer(client)

    try:
        while True:
            try:
                content = stream_reader.next()
            except (StopIteration, vineyard.StreamDrainedException):
                stream_writer.finish()
                break

            # parse csv
            table = parse_dataframe_blocks(content, read_options,
                                           parse_options, convert_options)
            # write recordbatches
            stream_writer.write_table(table)
    except Exception:
        report_exception()
        stream_writer.fail()
        sys.exit(-1)
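
The `parse_dataframe_blocks` helper is not shown above; given the options built in this example, it presumably wraps `pyarrow.csv.read_csv`, roughly as sketched below. This is an assumption about the helper for illustration, not its original implementation:

import io

import pyarrow as pa
import pyarrow.csv


def parse_dataframe_blocks(content, read_options, parse_options, convert_options):
    # `content` is a chunk of raw CSV bytes read from the byte stream
    return pa.csv.read_csv(
        io.BytesIO(bytes(content)),
        read_options=read_options,
        parse_options=parse_options,
        convert_options=convert_options,
    )  # a pyarrow.Table, consumed by stream_writer.write_table(...)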