def read_vineyard_dataframe(
    vineyard_socket, path, storage_options, read_options, proc_num, proc_index
):
    client = vineyard.connect(vineyard_socket)
    params = dict()
    if storage_options:
        raise ValueError("Reading from vineyard does not support storage options")
    params["header_row"] = "1" if read_options.get("header_row", False) else "0"
    params["delimiter"] = bytes(
        read_options.get("delimiter", ","), "utf-8"
    ).decode("unicode_escape")

    stream = DataframeStream.new(client, params)
    client.persist(stream.id)
    report_success(stream.id)

    # the "name" part in the URL can be either a name or an ObjectID, for convenience.
    name = urlparse(path).netloc
    try:
        df_id = client.get_name(name)
    except Exception:
        df_id = vineyard.ObjectID(name)
    dataframes = client.get(df_id)

    writer: DataframeStream.Writer = stream.open_writer(client)
    try:
        for df in dataframes:
            batch = pa.RecordBatch.from_pandas(df)
            writer.write(batch)
        writer.finish()
    except Exception:
        report_exception()
        writer.fail()
        sys.exit(-1)

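# A minimal, hedged sketch (not part of the driver above) of how the "name"
# component of the path is recovered with urlparse. The "vineyard://" scheme and
# the example names are illustrative assumptions, not taken from this module:
# the part after the scheme ends up in `netloc` and may be either a registered
# name (resolved via `client.get_name`) or a stringified ObjectID (the fallback).
def _example_resolve_dataframe_name():
    from urllib.parse import urlparse

    # A registered name resolves through `client.get_name(...)`.
    assert urlparse("vineyard://my_training_dataframe").netloc == "my_training_dataframe"
    # A stringified ObjectID falls back to `vineyard.ObjectID(...)`.
    assert urlparse("vineyard://o000a12b3c4d5e6f7").netloc == "o000a12b3c4d5e6f7"
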
def read_byte_stream(
    client,
    fs: AbstractFileSystem,
    stream: ByteStream,
    path: str,
    chunk_size: int = CHUNK_SIZE,
):
    logger.info('start reading blob at %s', path)
    with fs.open(path, mode="rb") as f:
        try:
            total_size = f.size()
        except TypeError:
            total_size = f.size
        writer = stream.open_writer(client)
        try:
            begin, end = 0, total_size
            while begin < end:
                buffer = read_block(f, begin, min(chunk_size, end - begin))
                chunk = writer.next(len(buffer))
                vineyard.memory_copy(chunk, 0, buffer)
                begin += len(buffer)
        except Exception:
            report_exception()
            writer.fail()
            sys.exit(-1)
        writer.finish()
        return total_size

def main():
    if len(sys.argv) < 3:
        print("usage: ./serializer <ipc_socket> <object_id>")
        exit(1)
    ipc_socket = sys.argv[1]
    object_id = vineyard.ObjectID(sys.argv[2])
    try:
        serialize(ipc_socket, object_id)
    except Exception:
        report_exception()
        sys.exit(-1)

def write_bytes(
    vineyard_socket,
    path,
    stream_id,
    storage_options,
    write_options,
    proc_num,
    proc_index,
):
    """Read bytes from a stream and write them to external storage.

    Args:
        vineyard_socket (str): IPC socket path.
        path (str): External storage path to write to.
        stream_id (str): ObjectID of the stream to be read from,
            which is a ParallelStream.
        storage_options (dict): Configurations of the external storage.
        write_options (dict): Additional options that control the behavior
            of the write.
        proc_num (int): Total number of processes.
        proc_index (int): Index of this process.

    Raises:
        ValueError: If the stream is invalid.
    """
    client = vineyard.connect(vineyard_socket)
    streams = client.get(stream_id)
    if len(streams) != proc_num or streams[proc_index] is None:
        report_error(
            f"Fetch stream error with proc_num = {proc_num}, proc_index = {proc_index}"
        )
        sys.exit(-1)
    instream: ByteStream = streams[proc_index]
    try:
        reader = instream.open_reader(client)
        of = fsspec.open(f"{path}_{proc_index}", "wb", **storage_options)
    except Exception:
        report_exception()
        sys.exit(-1)

    lengths = []  # store the length of each chunk; may be unused
    with of as f:
        while True:
            try:
                chunk = reader.next()
            except (StopIteration, vineyard.StreamDrainedException):
                break
            lengths.append(len(chunk))
            f.write(bytes(chunk))

def main():
    if len(sys.argv) < 5:
        print(
            "usage: ./deserializer <ipc_socket> <object_id> <proc_num> <proc_index>"
        )
        exit(1)
    ipc_socket = sys.argv[1]
    object_id = vineyard.ObjectID(sys.argv[2])
    proc_num = int(sys.argv[3])
    proc_index = int(sys.argv[4])
    try:
        deserialize(ipc_socket, object_id, proc_num, proc_index)
    except Exception:
        report_exception()
        sys.exit(-1)

def write_byte_stream(client, stream: ByteStream, prefix: str, storage_options: Dict):
    path = stream.params[StreamCollection.KEY_OF_PATH]
    try:
        reader = stream.open_reader(client)
        of = fsspec.open(os.path.join(prefix, path), "wb", **storage_options)
    except Exception:
        report_exception()
        sys.exit(-1)

    with of as f:
        while True:
            try:
                chunk = reader.next()
            except (StopIteration, vineyard.StreamDrainedException):
                break
            f.write(bytes(chunk))

def parse_dataframe(vineyard_socket, stream_id, proc_num, proc_index):
    client = vineyard.connect(vineyard_socket)
    streams = client.get(stream_id)
    if len(streams) != proc_num or streams[proc_index] is None:
        raise ValueError(
            f"Fetch stream error with proc_num={proc_num},proc_index={proc_index}"
        )
    instream: DataframeStream = streams[proc_index]
    stream_reader = instream.open_reader(client)

    generate_header_row = instream.params.get("header_row", None) == "1"
    delimiter = instream.params.get("delimiter", ",")

    stream = ByteStream.new(client, params=instream.params)
    client.persist(stream.id)
    report_success(stream.id)

    stream_writer = stream.open_writer(client)
    first_write = generate_header_row
    try:
        while True:
            try:
                batch = stream_reader.next()  # pa.RecordBatch
            except (StopIteration, vineyard.StreamDrainedException):
                stream_writer.finish()
                break
            df = batch.to_pandas()
            csv_content = df.to_csv(
                header=first_write, index=False, sep=delimiter
            ).encode('utf-8')

            # write to the byte stream
            first_write = False
            chunk = stream_writer.next(len(csv_content))
            vineyard.memory_copy(chunk, 0, csv_content)
    except Exception:
        report_exception()
        stream_writer.fail()
        sys.exit(-1)

def read_bytes(  # noqa: C901
    vineyard_socket: str,
    path: str,
    storage_options: Dict,
    read_options: Dict,
    proc_num: int,
    proc_index: int,
):
    """Read bytes from external storage and produce a ByteStream,
    which will later be assembled into a ParallelStream.

    Args:
        vineyard_socket (str): IPC socket path.
        path (str): External storage path to read from.
        storage_options (dict): Configurations of the external storage.
        read_options (dict): Additional options that control the behavior
            of the read.
        proc_num (int): Total number of processes.
        proc_index (int): Index of this process.

    Raises:
        ValueError: If the stream is invalid.
    """
    client = vineyard.connect(vineyard_socket)
    params = dict()

    read_block_delimiter = read_options.pop('read_block_delimiter', '\n')
    if read_block_delimiter is not None:
        read_block_delimiter = read_block_delimiter.encode('utf-8')

    # Used when reading tables from external storage,
    # usually for loading a property graph.
    header_row = read_options.get("header_row", False)

    for k, v in read_options.items():
        if k in ("header_row", "include_all_columns"):
            params[k] = "1" if v else "0"
        elif k == "delimiter":
            params[k] = bytes(v, "utf-8").decode("unicode_escape")
        else:
            params[k] = v

    try:
        protocol = split_protocol(path)[0]
        fs = fsspec.filesystem(protocol, **storage_options)
    except Exception:
        report_error(
            f"Cannot initialize such filesystem for '{path}', "
            f"exception is:\n{traceback.format_exc()}"
        )
        sys.exit(-1)

    if fs.isfile(path):
        files = [path]
    else:
        try:
            files = fs.glob(path + '*')
            assert files, f"Cannot find such files: {path}"
        except Exception:
            report_error(f"Cannot find such files for '{path}'")
            sys.exit(-1)

    ''' Note [Semantics of read_block with delimiter]:

        read_block(fp, begin, size, delimiter) will:

        - find the first `delimiter` from `begin`, then start reading;
        - after `size` bytes, read on until the next `delimiter` or EOF,
          then finish reading.

        Note that the returned size may exceed `size`.
    '''

    stream, writer = None, None
    if 'chunk_size' in storage_options:
        chunk_size = parse_readable_size(storage_options['chunk_size'])
    else:
        chunk_size = 1024 * 1024 * 64  # default: 64MB

    try:
        for index, file_path in enumerate(files):
            with fs.open(file_path, mode="rb") as f:
                offset = 0
                # Only process the header line when handling the first file,
                # and open the writer when handling the first file.
                if index == 0:
                    if header_row:
                        header_line = read_block(f, 0, 1, read_block_delimiter)
                        params["header_line"] = header_line.decode("unicode_escape")
                        offset = len(header_line)
                    stream = ByteStream.new(client, params)
                    client.persist(stream.id)
                    report_success(stream.id)
                    writer = stream.open_writer(client)

                try:
                    total_size = f.size()
                except TypeError:
                    total_size = f.size
                part_size = (total_size - offset) // proc_num
                begin = part_size * proc_index + offset
                end = min(begin + part_size, total_size)

                # See Note [Semantics of read_block with delimiter].
                if index == 0 and proc_index == 0:
                    begin -= int(header_row)

                while begin < end:
                    buffer = read_block(
                        f,
                        begin,
                        min(chunk_size, end - begin),
                        delimiter=read_block_delimiter,
                    )
                    size = len(buffer)
                    if size <= 0:
                        break
                    begin += size - 1
                    chunk = writer.next(size)
                    vineyard.memory_copy(chunk, 0, buffer)
        writer.finish()
    except Exception:
        report_exception()
        if writer is not None:
            writer.fail()
        sys.exit(-1)

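# A hedged, standalone illustration of Note [Semantics of read_block with
# delimiter], assuming `read_block` here is `fsspec.utils.read_block` (this
# module already relies on fsspec); the sample bytes are made up.
def _example_read_block_semantics():
    from io import BytesIO

    from fsspec.utils import read_block

    data = b"Alice,100\nBob,200\nCharlie,300\n"

    # With offset 0 the read starts at the beginning and runs past `size`
    # until the next delimiter, so the result can be longer than requested.
    assert read_block(BytesIO(data), 0, 5, delimiter=b"\n") == b"Alice,100\n"

    # With a non-zero offset the read first skips ahead to the delimiter
    # following `begin`, then stops at the delimiter after `begin + size`,
    # which keeps every returned block aligned on whole rows.
    assert read_block(BytesIO(data), 5, 10, delimiter=b"\n") == b"Bob,200\n"
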
def parse_bytes(vineyard_socket, stream_id, proc_num, proc_index):  # noqa: C901
    client = vineyard.connect(vineyard_socket)

    # get input streams
    streams = client.get(stream_id)
    if len(streams) != proc_num or streams[proc_index] is None:
        raise ValueError(
            f"Fetch stream error with proc_num={proc_num},proc_index={proc_index}"
        )
    instream: ByteStream = streams[proc_index]
    stream_reader = instream.open_reader(client)

    use_header_row = instream.params.get("header_row", None) == "1"
    delimiter = instream.params.get("delimiter", ",")

    # process parsing and converting options
    columns = []
    column_types = []
    original_columns = []
    header_line = None

    if use_header_row:
        header_line: str = instream.params.get('header_line', None)
        if not header_line:
            report_error('Header line not found while header_row is set to True')
            sys.exit(-1)
        original_columns = header_line.strip().split(delimiter)

    schema = instream.params.get('schema', None)
    if schema:
        columns = schema.split(',')

    column_types = instream.params.get('column_types', [])
    if column_types:
        column_types = column_types.split(',')

    include_all_columns = instream.params.get('include_all_columns', None) == '1'

    read_options = pa.csv.ReadOptions()
    parse_options = pa.csv.ParseOptions()
    convert_options = pa.csv.ConvertOptions()

    if original_columns:
        read_options.column_names = original_columns
    else:
        read_options.autogenerate_column_names = True
    parse_options.delimiter = delimiter

    indices = []
    for i, column in enumerate(columns):
        if original_columns:
            if column.isdigit():
                column_index = int(column)
                if column_index >= len(original_columns):
                    raise IndexError(
                        'Column index out of range: %s of %s'
                        % (column_index, original_columns)
                    )
                indices.append(i)
                columns[i] = original_columns[column_index]
        else:
            columns[i] = 'f%s' % i  # arrow auto-generates column names in that way

    if include_all_columns:
        for column in original_columns:
            if column not in columns:
                columns.append(column)

    if columns:
        convert_options.include_columns = columns
    if len(column_types) > len(columns):
        raise ValueError(
            "Format of column type schema is incorrect: too many columns"
        )

    arrow_column_types = dict()
    for i, column_type in enumerate(column_types):
        if column_type:
            arrow_column_types[columns[i]] = normalize_arrow_dtype(column_type)
    convert_options.column_types = arrow_column_types

    stream = DataframeStream.new(client, params=instream.params)
    client.persist(stream.id)
    report_success(stream.id)

    stream_writer = stream.open_writer(client)
    try:
        while True:
            try:
                content = stream_reader.next()
            except (StopIteration, vineyard.StreamDrainedException):
                stream_writer.finish()
                break

            # parse csv
            table = parse_dataframe_blocks(
                content, read_options, parse_options, convert_options
            )
            # write recordbatches
            stream_writer.write_table(table)
    except Exception:
        report_exception()
        stream_writer.fail()
        sys.exit(-1)

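# A hedged sketch of how the pyarrow CSV options assembled above turn raw CSV
# bytes into a typed table. `parse_dataframe_blocks` is not shown in this
# module, so this sketch calls `pyarrow.csv.read_csv` directly with made-up
# column names and types; it only approximates what the real helper does.
def _example_parse_csv_block():
    import io

    import pyarrow as pa
    import pyarrow.csv

    content = b"1|alice\n2|bob\n"

    # Column names are supplied explicitly (as when `header_line` was parsed),
    # so both input rows are treated as data, not as a header.
    read_options = pa.csv.ReadOptions(column_names=["id", "name"])
    parse_options = pa.csv.ParseOptions(delimiter="|")
    convert_options = pa.csv.ConvertOptions(
        include_columns=["id", "name"],
        column_types={"id": pa.int64(), "name": pa.string()},
    )

    table = pa.csv.read_csv(
        io.BytesIO(content),
        read_options=read_options,
        parse_options=parse_options,
        convert_options=convert_options,
    )
    assert table.column_names == ["id", "name"]
    assert table.num_rows == 2
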