示例#1
0
def pandas_index_builder(client, value, builder, **kw):
    '''Assemble and register ``vineyard::Index`` metadata for ``value``.

    The index name is stored JSON-encoded under ``name`` and the dtype name
    under ``value_type_``. The values, obtained with ``value.to_numpy()``,
    are built through ``builder.run`` (which receives ``**kw``) and attached
    as the ``value_`` member.

    Returns:
        Whatever ``client.create_metadata`` returns for the assembled meta.
    '''
    index_meta = ObjectMeta()
    index_meta['typename'] = 'vineyard::Index'
    index_meta['name'] = to_json(value.name)
    index_meta['value_type_'] = value.dtype.name

    values_member = builder.run(client, value.to_numpy(), **kw)
    index_meta.add_member('value_', values_member)
    return client.create_metadata(index_meta)
示例#2
0
def schema_proxy_builder(client, schema, builder):
    '''Assemble and register ``vineyard::SchemaProxy`` metadata for ``schema``.

    The schema is serialized with ``schema.serialize()``; the serialized
    payload is turned into a member by ``buffer_builder`` and attached as
    ``buffer_``, and its length is recorded as ``nbytes``.

    Returns:
        Whatever ``client.create_metadata`` returns for the assembled meta.
    '''
    proxy_meta = ObjectMeta()
    proxy_meta['typename'] = 'vineyard::SchemaProxy'

    payload = schema.serialize()
    payload_member = buffer_builder(client, payload, builder)
    proxy_meta.add_member('buffer_', payload_member)
    proxy_meta['nbytes'] = len(payload)
    return client.create_metadata(proxy_meta)
示例#3
0
def traverse_to_serialize(
    client,
    meta: ObjectMeta,
    queue: "ConcurrentQueue[Tuple[ByteStream, memoryview]]",
    path: str,
) -> ObjectID:
    '''Recursively map ``meta`` onto streams, queueing blob payloads.

    A ``vineyard::Blob`` gets one stream, created by ``build_a_stream`` under
    ``<path>/blob``; the ``(stream, buffer)`` pair is put on ``queue``.
    Any other object gets a ``StreamCollection`` whose metadata copies the
    plain (non-``ObjectMeta``) fields and whose streams are the results of
    recursing into every local member.

    Returns:
        The generated stream or stream collection id.
    '''
    # Leaf: a blob is paired with its buffer and handed to the queue.
    if meta.typename == 'vineyard::Blob':
        stream = build_a_stream(client, meta, os.path.join(path, 'blob'))
        payload = meta.get_buffer(meta.id)
        queue.put((stream, payload))
        return stream.id

    fields = {StreamCollection.KEY_OF_GLOBAL: meta.isglobal}
    member_streams = []
    for key, item in meta.items():
        if key == 'typename':
            # The type name is stored under a dunder-prefixed key.
            fields['__typename'] = item
            continue
        if not isinstance(item, ObjectMeta):
            fields[key] = item
            continue
        # Members that are not local are neither recursed into nor recorded.
        if item.islocal:
            member_path = os.path.join(path, key)
            member_streams.append(
                traverse_to_serialize(client, item, queue, member_path))

    fields[StreamCollection.KEY_OF_PATH] = path
    return StreamCollection.new(client, fields, member_streams).id
示例#4
0
def numpy_ndarray_builder(client, value, **kw):
    '''Assemble and register ``vineyard::Tensor<dtype>`` metadata for ``value``.

    Records the dtype name, the JSON-encoded shape, the JSON-encoded
    ``partition_index`` keyword (``[]`` when absent) and ``value.nbytes``,
    then attaches the result of ``build_numpy_buffer`` as ``buffer_``.

    Returns:
        Whatever ``client.create_metadata`` returns for the assembled meta.
    '''
    tensor_meta = ObjectMeta()
    dtype_name = value.dtype.name
    tensor_meta['typename'] = 'vineyard::Tensor<%s>' % dtype_name
    tensor_meta['value_type_'] = dtype_name
    tensor_meta['shape_'] = json.dumps(value.shape)

    partition_index = kw.get('partition_index', [])
    tensor_meta['partition_index_'] = json.dumps(partition_index)
    tensor_meta['nbytes'] = value.nbytes

    data_member = build_numpy_buffer(client, value)
    tensor_meta.add_member('buffer_', data_member)
    return client.create_metadata(tensor_meta)
示例#5
0
def numpy_ndarray_builder(client, value, **kw):
    '''Assemble and register ``vineyard::Tensor<dtype>`` metadata for ``value``.

    Besides the dtype name, the dtype string (``value.dtype.str``), the
    JSON-encoded shape, the JSON-encoded ``partition_index`` keyword (``[]``
    when absent) and ``value.nbytes``, an ``order_`` field is stored: ``'C'``
    when ``value.flags['C_CONTIGUOUS']`` is true, otherwise ``'F'``. The
    result of ``build_numpy_buffer`` is attached as ``buffer_``.

    Returns:
        Whatever ``client.create_metadata`` returns for the assembled meta.
    '''
    tensor_meta = ObjectMeta()
    dtype = value.dtype
    tensor_meta['typename'] = 'vineyard::Tensor<%s>' % dtype.name
    tensor_meta['value_type_'] = dtype.name
    tensor_meta['value_type_meta_'] = dtype.str
    tensor_meta['shape_'] = to_json(value.shape)

    partition_index = kw.get('partition_index', [])
    tensor_meta['partition_index_'] = to_json(partition_index)
    tensor_meta['nbytes'] = value.nbytes

    # Anything that is not C-contiguous is labelled 'F'.
    layout = 'C' if value.flags['C_CONTIGUOUS'] else 'F'
    tensor_meta['order_'] = to_json(layout)

    data_member = build_numpy_buffer(client, value)
    tensor_meta.add_member('buffer_', data_member)
    return client.create_metadata(tensor_meta)
示例#6
0
def default_builder(client, value, **kwargs):
    '''Fallback builder: pickle ``value`` (protocol 5) and store it in a blob.

    The pickled bytes are copied into a blob created by ``client.create_blob``.
    The metadata, constructed as ``ObjectMeta(**kwargs)``, is typed
    ``vineyard::PickleBuffer``, records the payload length as both ``nbytes``
    and ``size_``, and carries the sealed blob as ``buffer_``.

    Returns:
        Whatever ``client.create_metadata`` returns for the assembled meta.
    '''
    pickled = pickle.dumps(value, protocol=5)
    size = len(pickled)

    blob_writer = client.create_blob(size)
    blob_writer.copy(0, pickled)

    pickle_meta = ObjectMeta(**kwargs)
    pickle_meta['typename'] = 'vineyard::PickleBuffer'
    pickle_meta['nbytes'] = size
    pickle_meta['size_'] = size
    pickle_meta.add_member('buffer_', blob_writer.seal(client))
    return client.create_metadata(pickle_meta)
示例#7
0
def pandas_dataframe_builder(client, value, builder, **kw):
    '''Assemble and register ``vineyard::DataFrame`` metadata for ``value``.

    Column labels are stringified and stored JSON-encoded under ``columns_``.
    For the i-th column, the stringified label is stored under
    ``__values_-key-<i>`` and the column data (``to_numpy(copy=False)``, built
    with ``builder.run``) is attached as member ``__values_-value-<i>``.

    Keyword Args:
        partition_index: a two-element sequence ``[row, column]``; defaults
            to ``[0, 0]``.

    Returns:
        Whatever ``client.create_metadata`` returns for the assembled meta.
    '''
    meta = ObjectMeta()
    meta['typename'] = 'vineyard::DataFrame'
    meta['columns_'] = json.dumps([str(x) for x in value.columns])
    # ``DataFrame.iteritems`` was removed in pandas 2.0; ``items`` yields the
    # same (label, Series) pairs and is available in older releases as well.
    for i, (name, column_value) in enumerate(value.items()):
        np_value = column_value.to_numpy(copy=False)
        meta['__values_-key-%d' % i] = str(name)
        meta.add_member('__values_-value-%d' % i,
                        builder.run(client, np_value))
    meta['nbytes'] = 0  # FIXME: the real size is not computed
    # Look the partition index up once instead of once per coordinate.
    partition_index = kw.get('partition_index', [0, 0])
    meta['partition_index_row_'] = partition_index[0]
    meta['partition_index_column_'] = partition_index[1]
    return client.create_metadata(meta)
示例#8
0
def schema_proxy_builder(client, schema, builder):
    '''Assemble and register ``vineyard::SchemaProxy`` metadata for ``schema``.

    Every field type is first passed through ``_resize_arrow_type`` and a new
    schema is built from the original names, the converted types and the
    original schema metadata. That schema is serialized, stored through
    ``buffer_builder`` as the ``buffer_`` member, and its length is recorded
    as ``nbytes``.

    Returns:
        Whatever ``client.create_metadata`` returns for the assembled meta.
    '''
    proxy_meta = ObjectMeta()
    proxy_meta['typename'] = 'vineyard::SchemaProxy'

    # Convert the field types (pa.StringArray, pa.ListArray, etc.) before
    # serializing.
    field_names = schema.names
    converted_types = list(map(_resize_arrow_type, schema.types))
    converted_fields = [
        pa.field(field_name, field_type)
        for field_name, field_type in zip(field_names, converted_types)
    ]
    converted_schema = pa.schema(converted_fields, schema.metadata)

    payload = converted_schema.serialize()
    payload_member = buffer_builder(client, payload, builder)
    proxy_meta.add_member('buffer_', payload_member)
    proxy_meta['nbytes'] = len(payload)
    return client.create_metadata(proxy_meta)
示例#9
0
def string_builder(client, value, **kwargs):
    '''Register ``value`` as ``vineyard::Scalar<std::string>`` metadata.

    The metadata is constructed as ``ObjectMeta(**kwargs)``. ``value`` is
    stored under ``value_``, the name of its Python type under ``type_``, and
    ``nbytes`` is set to 0.

    Returns:
        Whatever ``client.create_metadata`` returns for the assembled meta.
    '''
    scalar_meta = ObjectMeta(**kwargs)
    scalar_meta['typename'] = 'vineyard::Scalar<std::string>'
    scalar_meta['value_'] = value
    scalar_meta['type_'] = type(value).__name__
    scalar_meta['nbytes'] = 0
    return client.create_metadata(scalar_meta)
示例#10
0
def double_builder(client, value):
    '''Register ``value`` as ``vineyard::Scalar<double>`` metadata.

    ``value`` is stored under ``value_``, the name of its Python type under
    ``type_``, and ``nbytes`` is set to 0.

    Returns:
        Whatever ``client.create_metadata`` returns for the assembled meta.
    '''
    scalar_meta = ObjectMeta()
    scalar_meta['typename'] = 'vineyard::Scalar<double>'
    scalar_meta['value_'] = value
    scalar_meta['type_'] = type(value).__name__
    scalar_meta['nbytes'] = 0
    return client.create_metadata(scalar_meta)
示例#11
0
def tuple_builder(client, value, builder):
    '''Register ``value`` as ``vineyard::Pair`` or ``vineyard::Tuple`` metadata.

    A value of length 2 becomes a ``vineyard::Pair`` with members ``first_``
    and ``second_``. Any other length becomes a ``vineyard::Tuple`` with one
    ``__elements_-<i>`` member per item and the length stored under both
    ``size_`` and ``__elements_-size``. Every item is built with
    ``builder.run``.

    Returns:
        Whatever ``client.create_metadata`` returns for the assembled meta.
    '''
    length = len(value)

    if length != 2:
        tuple_meta = ObjectMeta()
        tuple_meta['typename'] = 'vineyard::Tuple'
        tuple_meta['size_'] = length
        for position, element in enumerate(value):
            element_member = builder.run(client, element)
            tuple_meta.add_member('__elements_-%d' % position, element_member)
        tuple_meta['__elements_-size'] = length
        return client.create_metadata(tuple_meta)

    # Exactly two items: use the dedicated pair type.
    pair_meta = ObjectMeta()
    pair_meta['typename'] = 'vineyard::Pair'
    pair_meta.add_member('first_', builder.run(client, value[0]))
    pair_meta.add_member('second_', builder.run(client, value[1]))
    return client.create_metadata(pair_meta)
示例#12
0
def string_builder(client, value):
    '''Register ``value`` as string-scalar metadata.

    The type name is the fully expanded ``std::basic_string`` form of
    ``vineyard::Scalar``. ``value`` is stored under ``value_``, the name of
    its Python type under ``type_``, and ``nbytes`` is set to 0.

    Returns:
        Whatever ``client.create_metadata`` returns for the assembled meta.
    '''
    scalar_typename = 'vineyard::Scalar<std::basic_string<char,std::char_traits<char>,std::allocator<char>>>'

    scalar_meta = ObjectMeta()
    scalar_meta['typename'] = scalar_typename
    scalar_meta['value_'] = value
    scalar_meta['type_'] = type(value).__name__
    scalar_meta['nbytes'] = 0
    return client.create_metadata(scalar_meta)
示例#13
0
def csc_matrix_builder(client, value, builder, **kw):
    '''Assemble and register ``vineyard::CSCMatrix<dtype>`` metadata.

    Records the dtype name, the JSON-encoded shape, ``ndim`` and ``nnz``;
    builds ``value.data``, ``value.indices`` and ``value.indptr`` with
    ``builder.run`` (forwarding ``**kw``) and attaches them as members of the
    same names. ``nbytes`` is computed as ``nnz * dtype.itemsize``.

    Returns:
        Whatever ``client.create_metadata`` returns for the assembled meta.
    '''
    matrix_meta = ObjectMeta()
    dtype = value.dtype
    matrix_meta['typename'] = 'vineyard::CSCMatrix<%s>' % dtype.name
    matrix_meta['value_type_'] = dtype.name
    matrix_meta['shape_'] = to_json(value.shape)
    matrix_meta['ndim'] = value.ndim
    matrix_meta['nnz'] = value.nnz

    # Member names match the attribute names on the matrix.
    for member_name in ('data', 'indices', 'indptr'):
        component = getattr(value, member_name)
        matrix_meta.add_member(member_name,
                               builder.run(client, component, **kw))

    partition_index = kw.get('partition_index', [])
    matrix_meta['partition_index_'] = to_json(partition_index)
    matrix_meta['nbytes'] = value.nnz * value.dtype.itemsize
    return client.create_metadata(matrix_meta)
示例#14
0
def torch_dataframe_builder(client, value, builder, **kw):
    '''Assemble and register ``vineyard::DataFrame`` metadata from ``value``.

    ``value`` is iterated as ``(x, y)`` pairs, once per column. For the column
    equal to the ``label`` keyword the ``y.numpy()`` values are collected;
    for every other column at position ``i`` the ``x[i].numpy()`` values are
    collected. Each collected list is built with ``builder.run`` and attached
    as ``__values_-value-<i>``.

    Keyword Args:
        cols: column names, stored JSON-encoded under ``columns_``.
        label: name of the label column, stored JSON-encoded under ``label``.
        partition_index: ``[row, column]``; defaults to ``[0, 0]``.
        row_batch_index: defaults to 0.

    Returns:
        Whatever ``client.create_metadata`` returns for the assembled meta.
    '''
    frame_meta = ObjectMeta()
    frame_meta['typename'] = 'vineyard::DataFrame'
    cols = kw.get('cols')
    label = kw.get('label')
    frame_meta['label'] = to_json(label)
    frame_meta['columns_'] = to_json(cols)

    for position, column in enumerate(cols):
        # One full pass over ``value`` per column.
        if column == label:
            samples = [target.numpy() for _, target in value]
        else:
            samples = [features[position].numpy() for features, _ in value]
        frame_meta['__values_-key-%d' % position] = to_json(column)
        frame_meta.add_member('__values_-value-%d' % position,
                              builder.run(client, samples))

    frame_meta['__values_-size'] = len(cols)
    frame_meta['partition_index_row_'] = kw.get('partition_index', [0, 0])[0]
    frame_meta['partition_index_column_'] = kw.get('partition_index',
                                                   [0, 0])[1]
    frame_meta['row_batch_index_'] = kw.get('row_batch_index', 0)
    return client.create_metadata(frame_meta)
示例#15
0
def tf_dataframe_builder(client, value, builder, **kw):
    '''Assemble and register ``vineyard::DataFrame`` metadata from ``value``.

    Column names are the keys of the features in ``value.take(1)`` plus a
    trailing ``'label'`` column. For every column, ``value.take(len(value))``
    is iterated as ``(features, labels)`` pairs and the ``.numpy()`` values
    are collected, built with ``builder.run`` and attached as
    ``__values_-value-<i>``.

    Keyword Args:
        partition_index: ``[row, column]``; defaults to ``[0, 0]``.
        row_batch_index: defaults to 0.

    Returns:
        Whatever ``client.create_metadata`` returns for the assembled meta.
    '''
    frame_meta = ObjectMeta()
    frame_meta['typename'] = 'vineyard::DataFrame'

    # NOTE(review): ``cols`` stays unbound if ``value.take(1)`` yields nothing.
    for first_features, _first_labels in value.take(1):
        cols = list(first_features.keys())
    cols.append('label')
    frame_meta['columns_'] = to_json(cols)

    for position, column in enumerate(cols):
        rows = value.take(len(value))
        if column == 'label':
            samples = [labels.numpy() for _, labels in rows]
        else:
            samples = [features[column].numpy() for features, _ in rows]
        frame_meta['__values_-key-%d' % position] = to_json(column)
        frame_meta.add_member('__values_-value-%d' % position,
                              builder.run(client, samples))

    frame_meta['__values_-size'] = len(cols)
    frame_meta['partition_index_row_'] = kw.get('partition_index', [0, 0])[0]
    frame_meta['partition_index_column_'] = kw.get('partition_index',
                                                   [0, 0])[1]
    frame_meta['row_batch_index_'] = kw.get('row_batch_index', 0)
    return client.create_metadata(frame_meta)
示例#16
0
def pandas_dataframe_builder(client, value, builder, **kw):
    '''Assemble and register ``vineyard::DataFrame`` metadata for ``value``.

    The column labels are stored JSON-encoded under ``columns_`` and the
    index is built with ``builder.run`` and attached as ``index_``. Column
    data is taken from the frame's internal blocks (``value._mgr.blocks``)
    rather than from per-column accessors; each column is then built with
    ``builder.run`` and attached as ``__values_-value-<i>`` with its
    JSON-encoded label under ``__values_-key-<i>``.

    Keyword Args:
        partition_index: ``[row, column]``; defaults to ``[0, 0]``.
        row_batch_index: defaults to 0.

    Returns:
        Whatever ``client.create_metadata`` returns for the assembled meta.
    '''
    meta = ObjectMeta()
    meta['typename'] = 'vineyard::DataFrame'
    meta['columns_'] = to_json(value.columns.values.tolist())
    meta.add_member('index_', builder.run(client, value.index))

    # Accumulate columns: one slot per column, filled from the blocks below.
    value_columns = [None] * len(value.columns)
    for block in value._mgr.blocks:
        # presumably the column positions this block covers -- ``expand_slice``
        # is defined elsewhere; TODO confirm
        slices = list(expand_slice(block.mgr_locs.indexer))
        if isinstance(block.values, pd.arrays.SparseArray):
            # A sparse block is expected to hold exactly one column and is
            # stored as-is.
            assert len(slices) == 1
            value_columns[slices[0]] = block.values
        elif len(slices) == 1:
            # Single-column block: take its first (only) row of values.
            value_columns[slices[0]] = block.values[0]
            vineyard_ref = getattr(block.values, '__vineyard_ref', None)
            # The block comes from vineyard: carry the reference over to the
            # extracted column.
            if vineyard_ref is not None:
                setattr(value_columns[slices[0]], '__vineyard_ref',
                        vineyard_ref)
        else:
            # Multi-column block: the index-th row of the block's values goes
            # to the index-th position listed in ``slices``.
            for index, column_index in enumerate(slices):
                value_columns[column_index] = block.values[index]

    for index, name in enumerate(value.columns):
        meta['__values_-key-%d' % index] = to_json(name)
        meta.add_member('__values_-value-%d' % index,
                        builder.run(client, value_columns[index]))
    meta['nbytes'] = 0  # FIXME: the real size is not computed
    meta['__values_-size'] = len(value.columns)
    meta['partition_index_row_'] = kw.get('partition_index', [0, 0])[0]
    meta['partition_index_column_'] = kw.get('partition_index', [0, 0])[1]
    meta['row_batch_index_'] = kw.get('row_batch_index', 0)
    return client.create_metadata(meta)
示例#17
0
def pandas_series_builder(client, value, builder, **kw):
    '''Assemble and register ``vineyard::Series`` metadata for ``value``.

    The series name is stored JSON-encoded under ``name``. The index is built
    with ``builder.run`` and attached as ``index_``; the values
    (``value.to_numpy()``) are built with ``builder.run`` (forwarding
    ``**kw``) and attached as ``value_``.

    Returns:
        Whatever ``client.create_metadata`` returns for the assembled meta.
    '''
    series_meta = ObjectMeta()
    series_meta['typename'] = 'vineyard::Series'
    series_meta['name'] = to_json(value.name)

    index_member = builder.run(client, value.index)
    series_meta.add_member('index_', index_member)

    values_member = builder.run(client, value.to_numpy(), **kw)
    series_meta.add_member('value_', values_member)
    return client.create_metadata(series_meta)
示例#18
0
def merge_global_object(vineyard_endpoint,
                        results: List[List[ObjectID]]) -> ObjectID:
    '''Merge the global objects listed in ``results`` into one global object.

    ``results`` is flattened into a single list of chunks. A single chunk is
    returned directly (converted to ``ObjectID`` if needed) without connecting
    to vineyard. Otherwise every chunk must be a global object: the
    ``ObjectMeta`` members of all chunks are collected into a new global
    object whose plain fields are copied from the first chunk (excluding
    ``id``, ``signature`` and ``instance_id``). The new object is created and
    persisted through a client connected to ``vineyard_endpoint``.

    Args:
        vineyard_endpoint: endpoint passed to ``vineyard.connect``.
        results: nested lists of object ids (or values convertible with
            ``ObjectID(...)``).

    Returns:
        The id of the single chunk, or of the newly created merged object.

    Raises:
        ValueError: if there is nothing to merge, if none of several chunks
            is global, or if global and non-global chunks are mixed.
    '''
    if results is None or len(results) == 0:
        raise ValueError("No available sub objects to merge")

    chunks = [chunk for subresults in results for chunk in subresults]

    if len(chunks) == 0:
        raise ValueError("No available sub objects to merge")

    if len(chunks) == 1:
        # fastpath: no need to merge
        only = chunks[0]
        return only if isinstance(only, ObjectID) else ObjectID(only)

    vineyard_rpc_client = vineyard.connect(vineyard_endpoint)
    metadatas = []
    for chunk in chunks:
        if not isinstance(chunk, ObjectID):
            chunk = ObjectID(chunk)
        metadatas.append(vineyard_rpc_client.get_meta(chunk))

    # Validate up front so the outcome does not depend on chunk order: the
    # previous in-loop check only caught a non-global chunk when it came
    # *after* a global one, and silently merged the opposite ordering.
    # ``(results,)`` keeps the message well-formed even if ``results`` is a
    # tuple.
    global_flags = [meta.isglobal for meta in metadatas]
    if not any(global_flags):
        raise ValueError(
            "Unable to merge more than one non-global objects: %s" %
            (results,))
    if not all(global_flags):
        raise ValueError('Not all sub objects are global objects: %s' %
                         (results,))

    # Map member id -> member key, across all chunks.
    chunkmap = dict()
    for meta in metadatas:
        for k, v in meta.items():
            if isinstance(v, ObjectMeta):
                chunkmap[v.id] = k

    base_meta = ObjectMeta()
    base_meta.set_global(True)
    # Plain fields come from the first chunk only; identity fields are left
    # out of the copy.
    for k, v in metadatas[0].items():
        if isinstance(v, ObjectMeta):
            continue
        if k in ('id', 'signature', 'instance_id'):
            continue
        base_meta[k] = v
    for v, k in chunkmap.items():
        base_meta.add_member(k, v)
    meta = vineyard_rpc_client.create_metadata(base_meta)
    vineyard_rpc_client.persist(meta.id)
    return meta.id
示例#19
0
def pandas_sparse_array_builder(client, value, builder, **kw):
    '''Assemble and register ``vineyard::SparseArray<dtype>`` metadata.

    The sparse index is decomposed through ``value.sp_index.__reduce__()``
    into its class and a ``(size, array)`` argument pair. The class name and
    size are stored as ``sp_index_name`` / ``sp_index_size``; the index array
    and ``value.sp_values`` are built with ``builder.run`` (forwarding
    ``**kw``) and attached as ``sp_index`` and ``sp_values``.

    Returns:
        Whatever ``client.create_metadata`` returns for the assembled meta.
    '''
    sparse_meta = ObjectMeta()
    dtype_name = value.dtype.name
    sparse_meta['typename'] = 'vineyard::SparseArray<%s>' % dtype_name
    sparse_meta['value_type_'] = dtype_name

    index_cls, index_args = value.sp_index.__reduce__()
    index_size, index_array = index_args
    sparse_meta['sp_index_name'] = index_cls.__name__
    sparse_meta['sp_index_size'] = index_size

    index_member = builder.run(client, index_array, **kw)
    sparse_meta.add_member('sp_index', index_member)
    values_member = builder.run(client, value.sp_values, **kw)
    sparse_meta.add_member('sp_values', values_member)
    return client.create_metadata(sparse_meta)
示例#20
0
def record_batch_builder(client, batch, builder):
    '''Assemble and register ``vineyard::RecordBatch`` metadata for ``batch``.

    Records the row count and the column count (the latter under both
    ``column_num_`` and ``__columns_-size``), attaches the schema built by
    ``schema_proxy_builder`` as ``schema_``, attaches each column built with
    ``builder.run`` as ``__columns_-<i>``, and stores ``batch.nbytes``.

    Returns:
        Whatever ``client.create_metadata`` returns for the assembled meta.
    '''
    batch_meta = ObjectMeta()
    batch_meta['typename'] = 'vineyard::RecordBatch'
    batch_meta['row_num_'] = batch.num_rows
    column_count = batch.num_columns
    batch_meta['column_num_'] = column_count
    batch_meta['__columns_-size'] = column_count

    schema_member = schema_proxy_builder(client, batch.schema, builder)
    batch_meta.add_member('schema_', schema_member)
    for position in range(column_count):
        column_member = builder.run(client, batch[position])
        batch_meta.add_member('__columns_-%d' % position, column_member)
    batch_meta['nbytes'] = batch.nbytes
    return client.create_metadata(batch_meta)
示例#21
0
def table_from_recordbatches(client, schema, batches, num_rows, num_columns, builder):
    '''Assemble ``vineyard::Table`` metadata from already-built batches.

    Records ``num_rows`` and ``num_columns`` as given and the number of
    batches under both ``batch_num_`` and ``__batches_-size``. The schema is
    built by ``schema_proxy_builder`` and attached as ``schema_``; each item
    of ``batches`` is attached unchanged as ``__batches_-<i>``. ``nbytes`` is
    set to 0.

    Returns:
        Whatever ``client.create_metadata`` returns for the assembled meta.
    '''
    batch_count = len(batches)

    table_meta = ObjectMeta()
    table_meta['typename'] = 'vineyard::Table'
    table_meta['num_rows_'] = num_rows
    table_meta['num_columns_'] = num_columns
    table_meta['batch_num_'] = batch_count
    table_meta['__batches_-size'] = batch_count

    schema_member = schema_proxy_builder(client, schema, builder)
    table_meta.add_member('schema_', schema_member)
    for position, batch_member in enumerate(batches):
        table_meta.add_member('__batches_-%d' % position, batch_member)
    table_meta['nbytes'] = 0
    return client.create_metadata(table_meta)
示例#22
0
def list_array_builder(client, array, builder):
    '''Assemble and register ``vineyard::LargeListArray`` metadata.

    For a ``pa.ListArray`` the offsets buffer is reinterpreted as a
    ``uint32`` array and cast to ``uint64`` before being stored; for any
    other input the offsets buffer is stored as it is. The validity bitmap
    and the offsets go through ``buffer_builder``; ``array.values`` goes
    through ``builder.run``.

    Returns:
        Whatever ``client.create_metadata`` returns for the assembled meta.
    '''
    list_meta = ObjectMeta()
    list_meta['typename'] = 'vineyard::LargeListArray'
    list_meta['length_'] = len(array)
    list_meta['null_count_'] = array.null_count
    list_meta['offset_'] = array.offset

    if not isinstance(array, pa.ListArray):
        # Otherwise assumed to be a pa.LargeListArray: offsets used as-is.
        wide_offsets = array.buffers()[1]
    else:
        narrow_buffer = array.buffers()[1]
        entry_width = pa.uint32().bit_width // 8
        entry_count = len(narrow_buffer) // entry_width
        narrow_offsets = pa.Array.from_buffers(pa.uint32(), entry_count,
                                               [None, narrow_buffer])
        wide_offsets = narrow_offsets.cast(pa.uint64()).buffers()[1]

    bitmap_member = buffer_builder(client, array.buffers()[0], builder)
    list_meta.add_member('null_bitmap_', bitmap_member)
    offsets_member = buffer_builder(client, wide_offsets, builder)
    list_meta.add_member('buffer_offsets_', offsets_member)
    list_meta.add_member('values_', builder.run(client, array.values))
    list_meta['nbytes'] = array.nbytes
    return client.create_metadata(list_meta)
示例#23
0
def table_builder(client, table, builder):
    '''Assemble and register ``vineyard::Table`` metadata for ``table``.

    The table is split with ``table.to_batches()``; each batch is built with
    ``record_batch_builder`` and attached as ``__batches_-<i>``. The schema
    is built with ``schema_proxy_builder`` and attached as ``schema_``. Row
    and column counts, the batch count (under both ``batch_num_`` and
    ``__batches_-size``) and ``table.nbytes`` are recorded.

    Returns:
        Whatever ``client.create_metadata`` returns for the assembled meta.
    '''
    table_meta = ObjectMeta()
    table_meta['typename'] = 'vineyard::Table'
    table_meta['num_rows_'] = table.num_rows
    table_meta['num_columns_'] = table.num_columns

    record_batches = table.to_batches()
    batch_count = len(record_batches)
    table_meta['batch_num_'] = batch_count
    table_meta['__batches_-size'] = batch_count

    schema_member = schema_proxy_builder(client, table.schema, builder)
    table_meta.add_member('schema_', schema_member)
    for position, record_batch in enumerate(record_batches):
        batch_member = record_batch_builder(client, record_batch, builder)
        table_meta.add_member('__batches_-%d' % position, batch_member)
    table_meta['nbytes'] = table.nbytes
    return client.create_metadata(table_meta)
示例#24
0
def numeric_array_builder(client, array, builder):
    '''Assemble and register ``vineyard::NumericArray<type>`` metadata.

    Records the length, null count and offset of ``array``. Buffer 0 of
    ``array.buffers()`` is stored as ``null_bitmap_`` and buffer 1 as
    ``buffer_``, both through ``buffer_builder``; ``array.nbytes`` is
    recorded as ``nbytes``.

    Returns:
        Whatever ``client.create_metadata`` returns for the assembled meta.
    '''
    numeric_meta = ObjectMeta()
    numeric_meta['typename'] = 'vineyard::NumericArray<%s>' % array.type
    numeric_meta['length_'] = len(array)
    numeric_meta['null_count_'] = array.null_count
    numeric_meta['offset_'] = array.offset

    # The bitmap is built before the data buffer, but attached after it.
    bitmap_member = buffer_builder(client, array.buffers()[0], builder)
    data_member = buffer_builder(client, array.buffers()[1], builder)

    numeric_meta.add_member('buffer_', data_member)
    numeric_meta.add_member('null_bitmap_', bitmap_member)
    numeric_meta['nbytes'] = array.nbytes
    return client.create_metadata(numeric_meta)
示例#25
0
def fixed_size_binary_array_builder(client, array, builder):
    '''Assemble and register ``vineyard::FixedSizeBinaryArray`` metadata.

    Records the length, null count, offset and ``byte_width`` of ``array``.
    Buffer 0 of ``array.buffers()`` is stored as ``null_bitmap_`` and buffer
    1 as ``buffer_``, both through ``buffer_builder``; ``array.nbytes`` is
    recorded as ``nbytes``.

    Returns:
        Whatever ``client.create_metadata`` returns for the assembled meta.
    '''
    binary_meta = ObjectMeta()
    binary_meta['typename'] = 'vineyard::FixedSizeBinaryArray'
    binary_meta['length_'] = len(array)
    binary_meta['null_count_'] = array.null_count
    binary_meta['offset_'] = array.offset
    binary_meta['byte_width_'] = array.byte_width

    # The bitmap is built before the data buffer, but attached after it.
    bitmap_member = buffer_builder(client, array.buffers()[0], builder)
    data_member = buffer_builder(client, array.buffers()[1], builder)

    binary_meta.add_member('buffer_', data_member)
    binary_meta.add_member('null_bitmap_', bitmap_member)
    binary_meta['nbytes'] = array.nbytes
    return client.create_metadata(binary_meta)
示例#26
0
def dali_tensor_builder(client, value, **kw):
    '''Assemble and register ``vineyard::Tensor`` metadata from a data/label pair.

    Asserts that the module-level ``dali`` is not ``None``. ``value[0]`` and
    ``value[1]`` are converted with ``np.array`` and stored through
    ``build_numpy_buffer`` as ``buffer_data_`` and ``buffer_label_``. For
    each of the two arrays the JSON-encoded shape, the dtype name and the
    dtype string are recorded.

    Keyword Args:
        partition_index: stored JSON-encoded; defaults to ``[]``.

    Returns:
        Whatever ``client.create_metadata`` returns for the assembled meta.
    '''
    assert dali is not None, "Nvidia DALI is not available"

    tensor_meta = ObjectMeta()
    tensor_meta['typename'] = 'vineyard::Tensor'
    partition_index = kw.get('partition_index', [])
    tensor_meta['partition_index_'] = to_json(partition_index)

    samples = np.array(value[0])
    targets = np.array(value[1])
    tensor_meta.add_member('buffer_data_', build_numpy_buffer(client, samples))
    tensor_meta.add_member('buffer_label_',
                           build_numpy_buffer(client, targets))

    tensor_meta['data_shape_'] = to_json(samples.shape)
    tensor_meta['label_shape_'] = to_json(targets.shape)
    tensor_meta['data_type_'] = samples.dtype.name
    tensor_meta['label_type_'] = targets.dtype.name
    tensor_meta['data_type_meta_'] = samples.dtype.str
    tensor_meta['label_type_meta_'] = targets.dtype.str
    return client.create_metadata(tensor_meta)
示例#27
0
def pandas_dataframe_builder(client, value, builder, **kw):
    '''Assemble and register ``vineyard::DataFrame`` metadata for ``value``.

    The column labels are stored JSON-encoded under ``columns_`` and the
    index is built with ``builder.run`` and attached as ``index_``. For the
    i-th column, the JSON-encoded label is stored under ``__values_-key-<i>``
    and the column data (``to_numpy(copy=False)``, built with
    ``builder.run``) is attached as member ``__values_-value-<i>``.

    Keyword Args:
        partition_index: a two-element sequence ``[row, column]``; defaults
            to ``[0, 0]``.
        row_batch_index: defaults to 0.

    Returns:
        Whatever ``client.create_metadata`` returns for the assembled meta.
    '''
    meta = ObjectMeta()
    meta['typename'] = 'vineyard::DataFrame'
    meta['columns_'] = to_json(value.columns.values.tolist())
    meta.add_member('index_', builder.run(client, value.index))
    # ``DataFrame.iteritems`` was removed in pandas 2.0; ``items`` yields the
    # same (label, Series) pairs and is available in older releases as well.
    for i, (name, column_value) in enumerate(value.items()):
        np_value = column_value.to_numpy(copy=False)
        meta['__values_-key-%d' % i] = to_json(name)
        meta.add_member('__values_-value-%d' % i,
                        builder.run(client, np_value))
    meta['nbytes'] = 0  # FIXME: the real size is not computed
    meta['__values_-size'] = len(value.columns)
    # Look the partition index up once instead of once per coordinate.
    partition_index = kw.get('partition_index', [0, 0])
    meta['partition_index_row_'] = partition_index[0]
    meta['partition_index_column_'] = partition_index[1]
    meta['row_batch_index_'] = kw.get('row_batch_index', 0)
    return client.create_metadata(meta)
示例#28
0
def string_array_builder(client, array, builder):
    '''Assemble and register ``vineyard::StringArray`` metadata for ``array``.

    Records the length, null count and offset of ``array``. Buffers 0, 1 and
    2 of ``array.buffers()`` are stored through ``buffer_builder`` as
    ``null_bitmap_``, ``buffer_offsets_`` and ``buffer_data_`` respectively;
    ``array.nbytes`` is recorded as ``nbytes``.

    Returns:
        Whatever ``client.create_metadata`` returns for the assembled meta.
    '''
    string_meta = ObjectMeta()
    string_meta['typename'] = 'vineyard::StringArray'
    string_meta['length_'] = len(array)
    string_meta['null_count_'] = array.null_count
    string_meta['offset_'] = array.offset

    # Built in buffer order (bitmap, offsets, data); attached in a different
    # order below.
    bitmap_member = buffer_builder(client, array.buffers()[0], builder)
    offsets_member = buffer_builder(client, array.buffers()[1], builder)
    data_member = buffer_builder(client, array.buffers()[2], builder)

    string_meta.add_member('buffer_offsets_', offsets_member)
    string_meta.add_member('buffer_data_', data_member)
    string_meta.add_member('null_bitmap_', bitmap_member)
    string_meta['nbytes'] = array.nbytes
    return client.create_metadata(string_meta)
示例#29
0
def torch_tensor_builder(client, value, **kw):
    '''Assemble and register ``vineyard::Tensor`` metadata from ``value``.

    ``value`` is wrapped in a ``DataLoader`` with ``batch_size=len(value)``.
    For each ``(x, y)`` batch it yields, the numpy views of ``x`` and ``y``
    are stored through ``build_numpy_buffer`` as ``buffer_data_`` and
    ``buffer_label_``, and their JSON-encoded shapes, dtype names and dtype
    strings are recorded.

    Keyword Args:
        partition_index: stored JSON-encoded; defaults to ``[]``.

    Returns:
        Whatever ``client.create_metadata`` returns for the assembled meta.
    '''
    tensor_meta = ObjectMeta()
    tensor_meta['typename'] = 'vineyard::Tensor'
    partition_index = kw.get('partition_index', [])
    tensor_meta['partition_index_'] = to_json(partition_index)

    # presumably yields a single batch, since the batch size equals the
    # dataset length -- TODO confirm
    loader = DataLoader(value, batch_size=len(value))
    for features, targets in loader:
        feature_array = features.numpy()
        target_array = targets.numpy()
        tensor_meta.add_member('buffer_data_',
                               build_numpy_buffer(client, feature_array))
        tensor_meta.add_member('buffer_label_',
                               build_numpy_buffer(client, target_array))
        tensor_meta['data_shape_'] = to_json(feature_array.shape)
        tensor_meta['label_shape_'] = to_json(target_array.shape)
        tensor_meta['data_type_'] = feature_array.dtype.name
        tensor_meta['label_type_'] = target_array.dtype.name
        tensor_meta['data_type_meta_'] = feature_array.dtype.str
        tensor_meta['label_type_meta_'] = target_array.dtype.str
    return client.create_metadata(tensor_meta)
示例#30
0
def make_global_dataframe(client, blocks, extra_meta=None) -> ObjectMeta:
    '''Create and persist ``vineyard::GlobalDataFrame`` metadata over ``blocks``.

    The metadata is marked global and records the number of blocks under
    ``partitions_-size``. Entries of ``extra_meta``, when given and
    non-empty, are copied onto the metadata. Each block is attached as
    ``partitions_-<i>``; a block that is not an ``ObjectMeta``, ``ObjectID``
    or ``Object`` is first wrapped with ``ObjectID(...)``.

    Returns:
        The metadata returned by ``client.create_metadata``, after it has
        been passed to ``client.persist``.
    '''
    global_meta = ObjectMeta()
    global_meta['typename'] = 'vineyard::GlobalDataFrame'
    global_meta.set_global(True)
    global_meta['partitions_-size'] = len(blocks)

    if extra_meta:
        for key, extra_value in extra_meta.items():
            global_meta[key] = extra_value

    accepted_types = (ObjectMeta, ObjectID, Object)
    for position, block in enumerate(blocks):
        member = block if isinstance(block, accepted_types) else ObjectID(block)
        global_meta.add_member('partitions_-%d' % position, member)

    created_meta = client.create_metadata(global_meta)
    client.persist(created_meta)
    return created_meta