def serialize(vineyard_socket, object_id):
    '''Serialize a vineyard object as a stream.

    The serialization executes in the following steps:

    1. glob all blobs in the meta
    2. build a stream for each blob
    3. generate a hierarchical `StreamCollection` object as the result
    '''
    client = vineyard.connect(vineyard_socket)
    meta = client.get_meta(object_id)

    queue: "ConcurrentQueue[Tuple[ByteStream, memoryview]]" = ConcurrentQueue()
    serialized_id = traverse_to_serialize(client, meta, queue, '')

    # the serialized object id is ready
    client.persist(serialized_id)
    report_success(serialized_id)

    # start the transfer
    #
    # can easily be implemented as a threaded executor in the future
    executor = ThreadStreamExecutor(SerializeExecutor, parallism=1, task_queue=queue)
    results = executor.execute()
    logger.info('finish serialization: %s', results)

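# A minimal usage sketch of the serializer above (illustrative only, not part
# of the adaptor): the socket path and the object id are placeholders, and
# running it requires a live vineyardd instance listening on that socket.
def _serialize_usage_example():
    import vineyard

    sock = '/var/run/vineyard.sock'                  # placeholder IPC socket
    target = vineyard.ObjectID('o0001e8964a2a45b2')  # placeholder object id
    serialize(sock, target)
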
def deserialize(vineyard_socket, object_id, proc_num, proc_index):
    client = vineyard.connect(vineyard_socket)
    streams = client.get(object_id)

    if len(streams) != proc_num:
        report_error("Expected: %s stream partitions" % proc_num)
        sys.exit(-1)

    queue: "ConcurrentQueue[Tuple[ByteStream, Union[BlobBuilder, Blob]]]" = (
        ConcurrentQueue()
    )
    traverse_to_prepare(client, streams[proc_index].id, queue)

    # serves as a stream id -> blob mapping
    rqueue: "ConcurrentQueue[Tuple[ObjectID, str, Blob]]" = ConcurrentQueue()

    # copy blobs
    executor = ThreadStreamExecutor(
        ReconstructExecututor,
        parallism=1,
        client=client,
        task_queue=queue,
        result_queue=rqueue,
    )
    executor.execute()

    blobs: Dict[ObjectID, Tuple[str, Blob]] = dict()
    while not rqueue.empty():
        bs, memberpath, blob = rqueue.get(block=False)
        blobs[bs] = (memberpath, blob)

    _, result = traverse_to_rebuild(client, streams[proc_index].id, blobs)
    client.persist(result.id)
    report_success(result.id)

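# A matching sketch for the deserializer (illustrative only): each worker
# passes its own proc_index so that it rebuilds exactly one of the proc_num
# stream partitions produced by the serializer. Normally every index runs in
# a separate worker process; the loop below only illustrates the calling
# convention, with placeholder socket and id values.
def _deserialize_usage_example(proc_num=2):
    import vineyard

    sock = '/var/run/vineyard.sock'                       # placeholder IPC socket
    serialized = vineyard.ObjectID('o0001e8964a2a45b3')   # placeholder id
    for proc_index in range(proc_num):
        deserialize(sock, serialized, proc_num, proc_index)
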
def read_vineyard_dataframe(vineyard_socket, path, storage_options,
                            read_options, proc_num, proc_index):
    client = vineyard.connect(vineyard_socket)
    params = dict()
    if storage_options:
        raise ValueError("Reading vineyard dataframes doesn't support storage options yet")
    params["header_row"] = "1" if read_options.get("header_row", False) else "0"
    params["delimiter"] = bytes(
        read_options.get("delimiter", ","), "utf-8"
    ).decode("unicode_escape")

    stream = DataframeStream.new(client, params)
    client.persist(stream.id)
    report_success(stream.id)

    name = urlparse(path).netloc
    # the "name" part in the URL can be a registered name, or an ObjectID
    # for convenience.
    try:
        df_id = client.get_name(name)
    except Exception:
        df_id = vineyard.ObjectID(name)
    dataframes = client.get(df_id)

    writer: DataframeStream.Writer = stream.open_writer(client)
    try:
        for df in dataframes:
            batch = pa.RecordBatch.from_pandas(df)
            writer.write(batch)
        writer.finish()
    except Exception:
        report_exception()
        writer.fail()
        sys.exit(-1)

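# The "name" extraction above only uses the netloc component of the given
# path; a short runnable illustration (the 'vineyard://' scheme is an
# arbitrary choice here, only the part after '//' matters):
def _path_name_examples():
    from urllib.parse import urlparse

    assert urlparse('vineyard://my_dataframe').netloc == 'my_dataframe'
    assert urlparse('vineyard://o0001e8964a2a45b2').netloc == 'o0001e8964a2a45b2'
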
def read_bytes_collection(
    vineyard_socket, prefix, storage_options, proc_num, proc_index
):
    """Read a set of files as a collection of ByteStreams."""
    client = vineyard.connect(vineyard_socket)

    protocol, prefix_path = split_protocol(prefix)
    fs = fsspec.filesystem(protocol, **storage_options)

    worker_prefix = os.path.join(prefix_path, '%s-%s' % (proc_num, proc_index))

    logger.info("start creating blobs ...")
    queue: "ConcurrentQueue[Tuple[ByteStream, str]]" = ConcurrentQueue()
    stream_id = read_stream_collections(
        client, fs, queue, worker_prefix, worker_prefix
    )

    client.persist(stream_id)
    report_success(stream_id)

    logger.info("start reading blobs ...")
    executor = ThreadStreamExecutor(
        ReadToByteStreamExecutor,
        parallism=1,
        client=client,
        fs=fs,
        task_queue=queue,
        chunk_size=CHUNK_SIZE,
    )
    executor.execute()

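# A short illustration of the prefix handling above, assuming `split_protocol`
# is fsspec.core.split_protocol (consistent with the fsspec usage here); paths
# without an explicit protocol fall back to the local filesystem:
def _prefix_handling_examples():
    from fsspec.core import split_protocol

    assert split_protocol('s3://bucket/exported') == ('s3', 'bucket/exported')
    assert split_protocol('/tmp/exported') == (None, '/tmp/exported')
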
def read_orc(
    vineyard_socket,
    path,
    storage_options: Dict,
    read_options: Dict,
    proc_num,
    proc_index,
):
    # This method reads the data files of a specific hive table that is
    # stored in ORC format on HDFS.
    #
    # In general, the data files of a hive table are stored in the hive
    # space on HDFS with the table name as the directory, e.g.,
    #
    # .. code:: python
    #
    #     '/user/hive/warehouse/sometable'
    #
    # To read the entire table, simply use 'hive://user/hive/warehouse/sometable'
    # as the path.
    #
    # In case the table is partitioned, use the sub-directory of a specific
    # partition to read only the data from that partition. For example, if
    # sometable is partitioned by the column date, we can read the data for a
    # given date by passing the path as
    #
    # .. code:: python
    #
    #     'hive://user/hive/warehouse/sometable/date=20201112'
    #
    if proc_index:
        raise ValueError("Parallel reading of ORC hasn't been supported yet")
    if read_options:
        raise ValueError("Reading ORC doesn't support read options.")
    client = vineyard.connect(vineyard_socket)
    stream = DataframeStream.new(client)
    client.persist(stream.id)
    report_success(stream.id)

    writer = stream.open_writer(client)
    parsed = urlparse(path)

    fs = fsspec.filesystem(parsed.scheme, **storage_options)
    if fs.isfile(parsed.path):
        files = [parsed.path]
    else:
        files = [f for f in fs.ls(parsed.path, detail=False) if fs.isfile(f)]
    for file_path in files:
        read_single_orc(file_path, fs, writer)
    # hdfs = HDFileSystem(
    #     host=host, port=int(port), pars={"dfs.client.read.shortcircuit": "false"}
    # )
    writer.finish()

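# A hedged sketch of the kind of work the `read_single_orc` helper (defined
# elsewhere in this package) is expected to do: open a single ORC file through
# fsspec and feed its record batches to the stream writer. The body below is
# an assumption for illustration only, reusing the `writer.write(batch)`
# interface shown in read_vineyard_dataframe above; it is not the real helper.
def _read_single_orc_sketch(file_path, fs, writer):
    import pyarrow.orc

    with fs.open(file_path, mode='rb') as f:
        table = pyarrow.orc.ORCFile(f).read()
    for batch in table.to_batches():
        writer.write(batch)
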
def parse_dataframe(vineyard_socket, stream_id, proc_num, proc_index):
    client = vineyard.connect(vineyard_socket)
    streams = client.get(stream_id)
    if len(streams) != proc_num or streams[proc_index] is None:
        raise ValueError(
            f"Fetch stream error with proc_num={proc_num},proc_index={proc_index}"
        )
    instream: DataframeStream = streams[proc_index]
    stream_reader = instream.open_reader(client)

    generate_header_row = instream.params.get("header_row", None) == "1"
    delimiter = instream.params.get("delimiter", ",")

    stream = ByteStream.new(client, params=instream.params)
    client.persist(stream.id)
    report_success(stream.id)

    stream_writer = stream.open_writer(client)
    first_write = generate_header_row
    try:
        while True:
            try:
                batch = stream_reader.next()  # pa.RecordBatch
            except (StopIteration, vineyard.StreamDrainedException):
                stream_writer.finish()
                break
            df = batch.to_pandas()
            csv_content = df.to_csv(
                header=first_write, index=False, sep=delimiter
            ).encode('utf-8')

            # write to byte stream
            first_write = False
            chunk = stream_writer.next(len(csv_content))
            vineyard.memory_copy(chunk, 0, csv_content)
    except Exception:
        report_exception()
        stream_writer.fail()
        sys.exit(-1)

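# The `first_write` flag above means only the very first record batch carries
# a header line in the emitted CSV; a quick, self-contained pandas check
# (pandas is already a dependency through `batch.to_pandas()`):
def _csv_header_example():
    import pandas as pd

    df = pd.DataFrame({'a': [1], 'b': [2]})
    assert df.to_csv(header=True, index=False, sep=',').splitlines() == ['a,b', '1,2']
    assert df.to_csv(header=False, index=False, sep=',').splitlines() == ['1,2']
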
def write_vineyard_dataframe(vineyard_socket, stream_id, proc_num, proc_index):
    client = vineyard.connect(vineyard_socket)
    streams = client.get(stream_id)
    if len(streams) != proc_num or streams[proc_index] is None:
        raise ValueError(
            f"Fetch stream error with proc_num={proc_num},proc_index={proc_index}"
        )
    instream: DataframeStream = streams[proc_index]
    stream_reader = instream.open_reader(client)

    batch_index = 0
    while True:
        try:
            batch = stream_reader.next()
        except Exception:
            break
        df = batch.to_pandas()
        df_id = client.put(
            df, partition_index=[proc_index, 0], row_batch_index=batch_index
        )
        batch_index += 1
        client.persist(df_id)
        report_success(df_id)

def read_bytes(  # noqa: C901
    vineyard_socket: str,
    path: str,
    storage_options: Dict,
    read_options: Dict,
    proc_num: int,
    proc_index: int,
):
    """Read bytes from external storage and produce a ByteStream,
    which will later be assembled into a ParallelStream.

    Args:
        vineyard_socket (str): IPC socket path.
        path (str): External storage path to read from.
        storage_options (dict): Configurations of the external storage.
        read_options (dict): Additional options that control the read behavior.
        proc_num (int): Total number of processes.
        proc_index (int): The index of this process.

    Raises:
        ValueError: If the stream is invalid.
    """
    client = vineyard.connect(vineyard_socket)
    params = dict()

    read_block_delimiter = read_options.pop('read_block_delimiter', '\n')
    if read_block_delimiter is not None:
        read_block_delimiter = read_block_delimiter.encode('utf-8')

    # Used when reading tables from external storage.
    # Usually for loading a property graph.
    header_row = read_options.get("header_row", False)
    for k, v in read_options.items():
        if k in ("header_row", "include_all_columns"):
            params[k] = "1" if v else "0"
        elif k == "delimiter":
            params[k] = bytes(v, "utf-8").decode("unicode_escape")
        else:
            params[k] = v

    try:
        protocol = split_protocol(path)[0]
        fs = fsspec.filesystem(protocol, **storage_options)
    except Exception:
        report_error(
            f"Cannot initialize such filesystem for '{path}', "
            f"exception is:\n{traceback.format_exc()}"
        )
        sys.exit(-1)

    if fs.isfile(path):
        files = [path]
    else:
        try:
            files = fs.glob(path + '*')
            assert files, f"Cannot find such files: {path}"
        except Exception:
            report_error(f"Cannot find such files for '{path}'")
            sys.exit(-1)

    ''' Note [Semantics of read_block with delimiter]:

    read_block(fp, begin, size, delimiter) will:

    - find the first `delimiter` from `begin`, then start reading;
    - after `size` bytes, continue until the next `delimiter` or EOF, then
      finish reading.

    Note that the returned size may exceed `size`.
    '''
    stream, writer = None, None
    if 'chunk_size' in storage_options:
        chunk_size = parse_readable_size(storage_options['chunk_size'])
    else:
        chunk_size = 1024 * 1024 * 64  # default: 64MB

    try:
        for index, file_path in enumerate(files):
            with fs.open(file_path, mode="rb") as f:
                offset = 0
                # Only process the header line when processing the first file,
                # and open the writer when processing the first file.
                if index == 0:
                    if header_row:
                        header_line = read_block(f, 0, 1, read_block_delimiter)
                        params["header_line"] = header_line.decode("unicode_escape")
                        offset = len(header_line)
                    stream = ByteStream.new(client, params)
                    client.persist(stream.id)
                    report_success(stream.id)
                    writer = stream.open_writer(client)

                try:
                    total_size = f.size()
                except TypeError:
                    total_size = f.size
                part_size = (total_size - offset) // proc_num
                begin = part_size * proc_index + offset
                end = min(begin + part_size, total_size)

                # See Note [Semantics of read_block with delimiter].
                if index == 0 and proc_index == 0:
                    begin -= int(header_row)

                while begin < end:
                    buffer = read_block(
                        f,
                        begin,
                        min(chunk_size, end - begin),
                        delimiter=read_block_delimiter,
                    )
                    size = len(buffer)
                    if size <= 0:
                        break
                    begin += size - 1
                    chunk = writer.next(size)
                    vineyard.memory_copy(chunk, 0, buffer)
        writer.finish()
    except Exception:
        report_exception()
        if writer is not None:
            writer.fail()
        sys.exit(-1)

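# A small runnable illustration of the Note above, assuming the `read_block`
# used here is fsspec.utils.read_block (consistent with the fsspec usage in
# this module). It shows that reads are aligned to the delimiter on both ends,
# so the returned block may be longer than the requested size.
def _read_block_semantics_example():
    from io import BytesIO

    from fsspec.utils import read_block as fsspec_read_block

    data = b"aaa\nbbb\nccc\nddd\n"

    # Requested 5 bytes from offset 0, but the block extends to the next '\n':
    assert fsspec_read_block(BytesIO(data), 0, 5, delimiter=b'\n') == b"aaa\nbbb\n"

    # A non-zero offset first skips ahead to the byte right after the next
    # '\n', so adjacent partitions neither drop nor duplicate records:
    assert fsspec_read_block(BytesIO(data), 4, 5, delimiter=b'\n') == b"ccc\n"
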
def parse_bytes(vineyard_socket, stream_id, proc_num, proc_index):  # noqa: C901
    client = vineyard.connect(vineyard_socket)

    # get input streams
    streams = client.get(stream_id)
    if len(streams) != proc_num or streams[proc_index] is None:
        raise ValueError(
            f"Fetch stream error with proc_num={proc_num},proc_index={proc_index}"
        )
    instream: ByteStream = streams[proc_index]
    stream_reader = instream.open_reader(client)

    use_header_row = instream.params.get("header_row", None) == "1"
    delimiter = instream.params.get("delimiter", ",")

    # process parsing and converting options

    columns = []
    column_types = []
    original_columns = []
    header_line = None

    if use_header_row:
        header_line: str = instream.params.get('header_line', None)
        if not header_line:
            report_error('Header line not found while header_row is set to True')
            sys.exit(-1)
        original_columns = header_line.strip().split(delimiter)

    schema = instream.params.get('schema', None)
    if schema:
        columns = schema.split(',')

    column_types = instream.params.get('column_types', [])
    if column_types:
        column_types = column_types.split(',')

    include_all_columns = instream.params.get('include_all_columns', None) == '1'

    read_options = pa.csv.ReadOptions()
    parse_options = pa.csv.ParseOptions()
    convert_options = pa.csv.ConvertOptions()

    if original_columns:
        read_options.column_names = original_columns
    else:
        read_options.autogenerate_column_names = True
    parse_options.delimiter = delimiter

    indices = []
    for i, column in enumerate(columns):
        if original_columns:
            if column.isdigit():
                column_index = int(column)
                if column_index >= len(original_columns):
                    raise IndexError(
                        'Column index out of range: %s of %s'
                        % (column_index, original_columns)
                    )
                indices.append(i)
                columns[i] = original_columns[column_index]
        else:
            # arrow auto generates column names in that way.
            columns[i] = 'f%s' % i

    if include_all_columns:
        for column in original_columns:
            if column not in columns:
                columns.append(column)

    if columns:
        convert_options.include_columns = columns
    if len(column_types) > len(columns):
        raise ValueError(
            "Format of column type schema is incorrect: too many columns"
        )

    arrow_column_types = dict()
    for i, column_type in enumerate(column_types):
        if column_type:
            arrow_column_types[columns[i]] = normalize_arrow_dtype(column_type)
    convert_options.column_types = arrow_column_types

    stream = DataframeStream.new(client, params=instream.params)
    client.persist(stream.id)
    report_success(stream.id)

    stream_writer = stream.open_writer(client)
    try:
        while True:
            try:
                content = stream_reader.next()
            except (StopIteration, vineyard.StreamDrainedException):
                stream_writer.finish()
                break
            # parse csv
            table = parse_dataframe_blocks(
                content, read_options, parse_options, convert_options
            )
            # write recordbatches
            stream_writer.write_table(table)
    except Exception:
        report_exception()
        stream_writer.fail()
        sys.exit(-1)

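# A hedged sketch of what the `parse_dataframe_blocks` helper (defined
# elsewhere in this package) is expected to do with the three pyarrow CSV
# option objects prepared above: parse one chunk of raw CSV bytes into an
# Arrow table. The body below is an illustration only, not the actual helper.
def _parse_dataframe_blocks_sketch(content, read_options, parse_options, convert_options):
    import pyarrow as pa
    import pyarrow.csv

    return pa.csv.read_csv(
        pa.BufferReader(pa.py_buffer(content)),
        read_options=read_options,
        parse_options=parse_options,
        convert_options=convert_options,
    )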