Example #1
def test_clone():
    context = pa.SerializationContext()

    class Foo(object):
        pass

    def custom_serializer(obj):
        return 0

    def custom_deserializer(serialized_obj):
        return (serialized_obj, 'a')

    context.register_type(Foo, 'Foo', custom_serializer=custom_serializer,
                          custom_deserializer=custom_deserializer)

    new_context = context.clone()

    f = Foo()
    serialized = pa.serialize(f, context=context)
    deserialized = serialized.deserialize(context=context)
    assert deserialized == (0, 'a')

    serialized = pa.serialize(f, context=new_context)
    deserialized = serialized.deserialize(context=new_context)
    assert deserialized == (0, 'a')
Example #2
def test_serialize_recursive_objects():
    class ClassA(object):
        pass

    # Make a list that contains itself.
    lst = []
    lst.append(lst)

    # Make an object that contains itself as a field.
    a1 = ClassA()
    a1.field = a1

    # Make two objects that contain each other as fields.
    a2 = ClassA()
    a3 = ClassA()
    a2.field = a3
    a3.field = a2

    # Make a dictionary that contains itself.
    d1 = {}
    d1["key"] = d1

    # Make a numpy array that contains itself.
    arr = np.array([None], dtype=object)
    arr[0] = arr

    # Create a list of recursive objects.
    recursive_objects = [lst, a1, a2, a3, d1, arr]

    # Check that exceptions are thrown when we serialize the recursive
    # objects.
    for obj in recursive_objects:
        with pytest.raises(Exception):
            pa.serialize(obj).deserialize()
Example #3
def test_serialization_callback_error():

    class TempClass(object):
        pass

    # Pass a SerializationContext into serialize, but TempClass
    # is not registered
    serialization_context = pa.SerializationContext()
    val = TempClass()
    with pytest.raises(pa.SerializationCallbackError) as err:
        serialized_object = pa.serialize(val, serialization_context)
    assert err.value.example_object == val

    serialization_context.register_type(TempClass, "TempClass")
    serialized_object = pa.serialize(TempClass(), serialization_context)
    deserialization_context = pa.SerializationContext()

    # Pass a Serialization Context into deserialize, but TempClass
    # is not registered
    with pytest.raises(pa.DeserializationCallbackError) as err:
        serialized_object.deserialize(deserialization_context)
    assert err.value.type_id == "TempClass"

    class TempClass2(object):
        pass

    # Make sure that we receive an error when we use an inappropriate value for
    # the type_id argument.
    with pytest.raises(TypeError):
        serialization_context.register_type(TempClass2, 1)
Example #4
def test_numpy_subclass_serialization():
    # Check that we can properly serialize subclasses of np.ndarray.
    class CustomNDArray(np.ndarray):
        def __new__(cls, input_array):
            array = np.asarray(input_array).view(cls)
            return array

    def serializer(obj):
        return {'numpy': obj.view(np.ndarray)}

    def deserializer(data):
        array = data['numpy'].view(CustomNDArray)
        return array

    context = pa.default_serialization_context()

    context.register_type(CustomNDArray, 'CustomNDArray',
                          custom_serializer=serializer,
                          custom_deserializer=deserializer)

    x = CustomNDArray(np.zeros(3))
    serialized = pa.serialize(x, context=context).to_buffer()
    new_x = pa.deserialize(serialized, context=context)
    assert type(new_x) == CustomNDArray
    assert np.alltrue(new_x.view(np.ndarray) == np.zeros(3))
Example #5
def test_serialize_subclasses():

    # This test shows how subclasses can be handled in an idiomatic way
    # by having only a serializer for the base class

    # This technique should, however, be used with care, since pickling
    # type(obj) with cloudpickle will include the full class definition
    # in the serialized representation.
    # This means the class definition is part of every instance of the
    # object, which in general is not desirable; registering all subclasses
    # with register_type will result in faster and more memory
    # efficient serialization.

    context = pa.default_serialization_context()
    context.register_type(
        Serializable, "Serializable",
        custom_serializer=serialize_serializable,
        custom_deserializer=deserialize_serializable)

    a = SerializableClass()
    serialized = pa.serialize(a, context=context)

    deserialized = serialized.deserialize(context=context)
    assert type(deserialized).__name__ == SerializableClass.__name__
    assert deserialized.value == 3
Example #6
def _check_component_roundtrip(value, context=global_serialization_context):
    # Test to/from components
    serialized = pa.serialize(value, context=context)
    components = serialized.to_components()
    from_comp = pa.SerializedPyObject.from_components(components)
    recons = from_comp.deserialize(context=context)
    assert_equal(value, recons)
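
Hypothetical invocations of this helper (the values below are only illustrative; global_serialization_context and assert_equal come from the surrounding test module):

# Illustrative calls; any value supported by the serialization context works.
_check_component_roundtrip({'ints': np.arange(10), 'text': 'hello'})
_check_component_roundtrip([np.zeros((3, 4)), {'nested': [1, 2, 3]}])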
Example #7
def dumps_pyarrow(obj):
    """
    Serialize an object.

    Returns:
        Implementation-dependent bytes-like object
    """
    return pa.serialize(obj).to_buffer()
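
The matching deserialization helper is not shown in this example; a minimal sketch using the same legacy pyarrow API would be:

def loads_pyarrow(buf):
    """
    Deserialize an object produced by dumps_pyarrow.

    Returns:
        The reconstructed Python object
    """
    return pa.deserialize(buf)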
Example #8
def test_integer_limits(large_buffer):
    # Check that Numpy scalars can be represented up to their limit values
    # (except np.uint64 which is limited to 2**63 - 1)
    for dt in [np.int8, np.int16, np.int32, np.int64,
               np.uint8, np.uint16, np.uint32, np.uint64]:
        scal = dt(np.iinfo(dt).min)
        serialization_roundtrip(scal, large_buffer)
        if dt is not np.uint64:
            scal = dt(np.iinfo(dt).max)
            serialization_roundtrip(scal, large_buffer)
        else:
            scal = dt(2**63 - 1)
            serialization_roundtrip(scal, large_buffer)
            for v in (2**63, 2**64 - 1):
                scal = dt(v)
                with pytest.raises(pa.ArrowInvalid):
                    pa.serialize(scal)
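
serialization_roundtrip and large_buffer are fixtures defined elsewhere in the test module; a plausible sketch of the helper, assuming the legacy serialize_to/deserialize_from API and a preallocated scratch buffer, is:

def serialization_roundtrip(value, scratch_buffer):
    # Write the serialized payload into the preallocated scratch buffer...
    writer = pa.FixedSizeBufferWriter(scratch_buffer)
    pa.serialize_to(value, writer)

    # ...then read it back and verify the round trip.
    reader = pa.BufferReader(scratch_buffer)
    result = pa.deserialize_from(reader, None)
    assert_equal(value, result)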
Example #9
def pack(data):
    if LZ4_ENABLED:
        data = pyarrow.serialize(data).to_buffer().to_pybytes()
        data = lz4.frame.compress(data)
        # TODO(ekl) we shouldn't need to base64 encode this data, but this
        # seems to not survive a transfer through the object store if we don't.
        data = base64.b64encode(data).decode("ascii")
    return data
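
The inverse helper is not included here; a sketch of unpack under the same assumptions (base64 text wrapping an LZ4-compressed pyarrow payload) could be:

def unpack(data):
    if LZ4_ENABLED:
        data = base64.b64decode(data)
        data = lz4.frame.decompress(data)
        data = pyarrow.deserialize(data)
    return data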
Example #10
def test_serialization_callback_numpy():

    class DummyClass(object):
        pass

    def serialize_dummy_class(obj):
        x = np.zeros(4)
        return x

    def deserialize_dummy_class(serialized_obj):
        return serialized_obj

    context = pa.default_serialization_context()
    context.register_type(DummyClass, "DummyClass",
                          custom_serializer=serialize_dummy_class,
                          custom_deserializer=deserialize_dummy_class)

    pa.serialize(DummyClass(), context=context)
Example #11
File: compression.py  Project: adgirish/ray
def pack(data):
    if SNAPPY_ENABLED:
        data = snappy.compress(
            pyarrow.serialize(data).to_buffer().to_pybytes())
        # TODO(ekl) we shouldn't need to base64 encode this data, but this
        # seems to not survive a transfer through the object store if we don't.
        return base64.b64encode(data)
    else:
        return data
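
As with the LZ4 variant above, the corresponding unpack is omitted; a sketch under the same assumptions would be:

def unpack(data):
    if SNAPPY_ENABLED:
        data = snappy.decompress(base64.b64decode(data))
        return pyarrow.deserialize(data)
    else:
        return data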
Example #12
def test_serialize_with_pandas_objects():
    df = pd.DataFrame({'a': [1, 2, 3]}, index=[1, 2, 3])

    data = {
        'a_series': df['a'],
        'a_frame': df
    }

    serialized = pa.serialize(data).to_buffer()
    deserialized = pa.deserialize(serialized)
    assert_frame_equal(deserialized['a_frame'], df)
    assert_series_equal(deserialized['a_series'], df['a'])
Example #13
def test_tensor_alignment():
    # Deserialized numpy arrays should be 64-byte aligned.
    x = np.random.normal(size=(10, 20, 30))
    y = pa.deserialize(pa.serialize(x).to_buffer())
    assert y.ctypes.data % 64 == 0

    xs = [np.random.normal(size=i) for i in range(100)]
    ys = pa.deserialize(pa.serialize(xs).to_buffer())
    for y in ys:
        assert y.ctypes.data % 64 == 0

    xs = [np.random.normal(size=i * (1,)) for i in range(20)]
    ys = pa.deserialize(pa.serialize(xs).to_buffer())
    for y in ys:
        assert y.ctypes.data % 64 == 0

    xs = [np.random.normal(size=i * (5,)) for i in range(1, 8)]
    xs = [xs[i][(i + 1) * (slice(1, 3),)] for i in range(len(xs))]
    ys = pa.deserialize(pa.serialize(xs).to_buffer())
    for y in ys:
        assert y.ctypes.data % 64 == 0
Example #14
def test_serialization_callback_error():

    class TempClass(object):
        pass

    # Pass a SerializationContext into serialize, but TempClass
    # is not registered
    serialization_context = pa.SerializationContext()
    val = TempClass()
    with pytest.raises(pa.SerializationCallbackError) as err:
        serialized_object = pa.serialize(val, serialization_context)
    assert err.value.example_object == val

    serialization_context.register_type(TempClass, 20*b"\x00")
    serialized_object = pa.serialize(TempClass(), serialization_context)
    deserialization_context = pa.SerializationContext()

    # Pass a Serialization Context into deserialize, but TempClass
    # is not registered
    with pytest.raises(pa.DeserializationCallbackError) as err:
        serialized_object.deserialize(deserialization_context)
    assert err.value.type_id == 20*b"\x00"
Example #15
def test_deserialize_buffer_in_different_process():
    import tempfile

    f = tempfile.NamedTemporaryFile(delete=False)
    b = pa.serialize(pa.py_buffer(b'hello')).to_buffer()
    f.write(b.to_pybytes())
    f.close()

    subprocess_env = test_util.get_modified_env_with_pythonpath()

    dir_path = os.path.dirname(os.path.realpath(__file__))
    python_file = os.path.join(dir_path, 'deserialize_buffer.py')
    subprocess.check_call([sys.executable, python_file, f.name],
                          env=subprocess_env)
Example #16
def test_set_pickle():
    # Use a custom type to trigger pickling.
    class Foo(object):
        pass

    context = pa.SerializationContext()
    context.register_type(Foo, 'Foo', pickle=True)

    test_object = Foo()

    # Define a custom serializer and deserializer to use in place of pickle.

    def dumps1(obj):
        return b'custom'

    def loads1(serialized_obj):
        return serialized_obj + b' serialization 1'

    # Test that setting a custom pickler changes the behavior.
    context.set_pickle(dumps1, loads1)
    serialized = pa.serialize(test_object, context=context).to_buffer()
    deserialized = pa.deserialize(serialized.to_pybytes(), context=context)
    assert deserialized == b'custom serialization 1'

    # Define another custom serializer and deserializer.

    def dumps2(obj):
        return b'custom'

    def loads2(serialized_obj):
        return serialized_obj + b' serialization 2'

    # Test that setting another custom pickler changes the behavior again.
    context.set_pickle(dumps2, loads2)
    serialized = pa.serialize(test_object, context=context).to_buffer()
    deserialized = pa.deserialize(serialized.to_pybytes(), context=context)
    assert deserialized == b'custom serialization 2'
Example #17
def test_numpy_base_object(tmpdir):
    # ARROW-2040: deserialized Numpy array should keep a reference to the
    # owner of its memory
    path = os.path.join(str(tmpdir), 'zzz.bin')
    data = np.arange(12, dtype=np.int32)

    with open(path, 'wb') as f:
        f.write(pa.serialize(data).to_buffer())

    serialized = pa.read_serialized(pa.OSFile(path))
    result = serialized.deserialize()
    assert_equal(result, data)
    serialized = None
    assert_equal(result, data)
    assert result.base is not None
Example #18
def test_buffer_serialization():

    class BufferClass(object):
        pass

    def serialize_buffer_class(obj):
        return pa.frombuffer(b"hello")

    def deserialize_buffer_class(serialized_obj):
        return serialized_obj

    pa._default_serialization_context.register_type(
        BufferClass, "BufferClass", pickle=False,
        custom_serializer=serialize_buffer_class,
        custom_deserializer=deserialize_buffer_class)

    b = pa.serialize(BufferClass()).to_buffer()
    assert pa.deserialize(b).to_pybytes() == b"hello"
Example #19
def test_serialize_with_pandas_objects():
    df = pd.DataFrame({'a': [1, 2, 3]}, index=[1, 2, 3])
    s = pd.Series([1, 2, 3, 4])

    data = {
        'a_series': df['a'],
        'a_frame': df,
        's_series': s
    }

    serialized = pa.serialize(data).to_buffer()
    deserialized = pa.deserialize(serialized)
    assert_frame_equal(deserialized['a_frame'], df)

    assert_series_equal(deserialized['a_series'], df['a'])
    assert deserialized['a_series'].name == 'a'

    assert_series_equal(deserialized['s_series'], s)
    assert deserialized['s_series'].name is None
Example #20
def test_buffer_serialization():

    class BufferClass(object):
        pass

    def serialize_buffer_class(obj):
        return pa.py_buffer(b"hello")

    def deserialize_buffer_class(serialized_obj):
        return serialized_obj

    context = pa.default_serialization_context()
    context.register_type(
        BufferClass, "BufferClass",
        custom_serializer=serialize_buffer_class,
        custom_deserializer=deserialize_buffer_class)

    b = pa.serialize(BufferClass(), context=context).to_buffer()
    assert pa.deserialize(b, context=context).to_pybytes() == b"hello"
Example #21
def test_fallback_to_subclasses():

    class SubFoo(Foo):
        def __init__(self):
            Foo.__init__(self)

    # should be able to serialize/deserialize an instance
    # if a base class has been registered
    serialization_context = pa.SerializationContext()
    serialization_context.register_type(Foo, "Foo")

    subfoo = SubFoo()
    # should fall back to the Foo serializer
    serialized_object = pa.serialize(subfoo, serialization_context)

    reconstructed_object = serialized_object.deserialize(
        serialization_context
    )
    assert type(reconstructed_object) == Foo
Example #22
def test_numpy_matrix_serialization(tmpdir):
    class CustomType(object):
        def __init__(self, val):
            self.val = val

    path = os.path.join(str(tmpdir), 'pyarrow_npmatrix_serialization_test.bin')
    array = np.random.randint(low=-1, high=1, size=(2, 2))

    for data_type in [str, int, float, CustomType]:
        matrix = np.matrix(array.astype(data_type))

        with open(path, 'wb') as f:
            f.write(pa.serialize(matrix).to_buffer())

        serialized = pa.read_serialized(pa.OSFile(path))
        result = serialized.deserialize()
        assert_equal(result, matrix)
        assert_equal(result.dtype, matrix.dtype)
        serialized = None
        assert_equal(result, matrix)
        assert result.base is not None
Example #23
def test_deserialize_in_different_process():
    from multiprocessing import Process, Queue
    import re

    regex = re.compile(r"\d+\.\d*")

    serialization_context = pa.SerializationContext()
    serialization_context.register_type(type(regex), "Regex", pickle=True)

    serialized = pa.serialize(regex, serialization_context)
    serialized_bytes = serialized.to_buffer().to_pybytes()

    def deserialize_regex(serialized, q):
        import pyarrow as pa
        q.put(pa.deserialize(serialized))

    q = Queue()
    p = Process(target=deserialize_regex, args=(serialized_bytes, q))
    p.start()
    assert q.get().pattern == regex.pattern
    p.join()
Example #24
def test_deserialize_components_in_different_process():
    arr = pa.array([1, 2, 5, 6], type=pa.int8())
    ser = pa.serialize(arr)
    data = pickle.dumps(ser.to_components(), protocol=-1)

    code = """if 1:
        import pickle

        import pyarrow as pa

        data = {0!r}
        components = pickle.loads(data)
        arr = pa.deserialize_components(components)

        assert arr.to_pylist() == [1, 2, 5, 6], arr
        """.format(data)

    subprocess_env = test_util.get_modified_env_with_pythonpath()
    print("** sys.path =", sys.path)
    print("** setting PYTHONPATH to:", subprocess_env['PYTHONPATH'])
    subprocess.check_call(["python", "-c", code], env=subprocess_env)
Example #25
def serialize(data):
    return pyarrow.serialize(data, mars_serialize_context())
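
The read side is not shown; assuming mars_serialize_context registers the same types for both directions, a minimal counterpart would be:

def deserialize(data):
    return pyarrow.deserialize(data, mars_serialize_context())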
Example #26
context = pa.default_serialization_context()

start_time = timeit.default_timer()
serialized_df = context.serialize(df)
print(timeit.default_timer() - start_time)

df_components = serialized_df.to_components()

start_time = timeit.default_timer()
original_df = context.deserialize_components(df_components)
print(timeit.default_timer() - start_time)

original_df
data = {i: np.random.randn(500, 500) for i in range(100)}
buf = pa.serialize(data).to_buffer()
type(buf)
buf.size
restored_data = pa.deserialize(buf)
restored_data[0]
feather.write_feather(df, 'example.feather')
read_df = feather.read_feather('example.feather')
with open('example2.feather', 'wb') as f:
    feather.write_feather(df, f)

with open('example2.feather', 'rb') as f:
    read_df = feather.read_feather(f)

# StringIO buffer
buffer = StringIO()
df.to_csv(buffer)
Example #27
print()

t0 = time()
sframe_nocopy = carr.sframe
t1 = time()
print("Time for serializing array in-memory (caterva, no-copy): %.3fs" %
      (t1 - t0))

t0 = time()
sframe_copy = carr.to_sframe()
t1 = time()
print("Time for serializing array in-memory (caterva, copy): %.3fs" %
      (t1 - t0))

t0 = time()
serialized = pa.serialize(arr)
pyarrow_nocopy = serialized.to_components()
t1 = time()
print("Time for serializing array in-memory (arrow, no-copy): %.3fs" %
      (t1 - t0))

t0 = time()
pyarrow_copy = pa.serialize(arr).to_buffer().to_pybytes()
t1 = time()
print("Time for serializing array in-memory (arrow, copy): %.3fs" % (t1 - t0))

t0 = time()
frame_pickle = pickle.dumps(arr, protocol=4)
t1 = time()
print("Time for serializing array in-memory (pickle4, copy): %.3fs" %
      (t1 - t0))
Example #28
    def __init__(self, *args, **kwargs):
        super(LocalDiskArrowTableCache, self).__init__(*args, **kwargs)
        # Workaround for https://issues.apache.org/jira/browse/ARROW-5260:
        # unless we serialize something before deserialize_components is
        # called, we crash with a SIGSEGV.
        pa.serialize(0)
Example #29
def dumps_pyarrow(obj):
    return pyarrow.serialize(obj).to_buffer()
Example #30
def test_serialize_to_buffer():
    for nthreads in [1, 4]:
        for value in COMPLEX_OBJECTS:
            buf = pa.serialize(value).to_buffer(nthreads=nthreads)
            result = pa.deserialize(buf)
            assert_equal(value, result)
Example #31
def valueCart(valueInput, valueLabel):
    return compress(serialize((valueInput, valueLabel)).to_buffer())
Example #32
    def __init__(self,
                 opt,
                 is_training=True,
                 is_testing=False,
                 live_test=False):

        self.node_dim = opt.node_dim
        self.state_dim = opt.state_dim

        self.is_training = is_training
        self.is_testing = is_testing

        if live_test:

            all_data_node_id, all_data_node_type = load_single_program(
                opt.test_graph_path)
            all_data_node_id = np.array(
                all_data_node_id)[0:len(all_data_node_id)]
            all_data_node_type = np.array(
                all_data_node_type)[0:len(all_data_node_type)]

        else:
            base_name = os.path.basename(opt.path)

            if is_training:
                saved_input_filename = "%s/%s-%d-train.pkl" % (
                    opt.path, base_name, opt.n_classes)
            if is_testing:
                saved_input_filename = "%s/%s-%d-test.pkl" % (
                    opt.path, base_name, opt.n_classes)

            if os.path.exists(saved_input_filename):
                input_file = open(saved_input_filename, 'rb')
                buf = input_file.read()
                all_data_node_id, all_data_node_type = pyarrow.deserialize(buf)
                input_file.close()
            else:
                all_data_node_id, all_data_node_type = load_program_graphs_from_directory(
                    opt.path, is_training, is_testing, opt.n_classes)
                all_data_node_id = np.array(
                    all_data_node_id)[0:len(all_data_node_id)]
                all_data_node_type = np.array(
                    all_data_node_type)[0:len(all_data_node_type)]
                buf = pyarrow.serialize(
                    (all_data_node_id, all_data_node_type)).to_buffer()
                out = pyarrow.OSFile(saved_input_filename, 'wb')
                out.write(buf)
                out.close()

        self.pretrained_embeddings = opt.pretrained_embeddings
        self.batch_size = opt.train_batch_size

        label_lookup = {
            label: _onehot(label, opt.n_classes)
            for label in range(0, opt.n_classes)
        }

        self.label_lookup = label_lookup
        # if is_train == True:
        print("Number of all data : " + str(len(all_data_node_id)))
        # else:
        # print("Number of all testing data : " + str(len(all_data_node_id)))
        # self.n_edge_types =  find_max_edge_id(all_data_node_id)
        self.n_edge_types = 7
        # print("Edge types : " + str(self.n_edge_types))
        max_node_id = find_max_node_id(all_data_node_id)
        min_node_id = find_min_node_id(all_data_node_id)
        print("Max node id in data : " + str(max_node_id))
        print("Min node id in data : " + str(min_node_id))
        max_node_type = find_max_node_id(all_data_node_type)
        min_node_type = find_min_node_id(all_data_node_type)
        print("Max node type in data : " + str(max_node_type))
        print("Min node type in data : " + str(min_node_type))
        # print("Max node id : " + str(max_node_id))
        # print("Max node type : " + str(max_node_type))

        self.n_node_by_id = max_node_id
        self.n_node_by_type = max_node_type

        all_data_node_id = convert_program_data(all_data_node_id)
        all_data_node_type = convert_program_data(all_data_node_type)

        self.all_data_node_id = all_data_node_id
        self.all_data_node_type = all_data_node_type

        self.data = self.process_raw_graphs()
Example #33
def pa_serialize(obj):
    return pyarrow.serialize(obj).to_buffer()
Example #34
def make_lmdb_gesture_dataset(base_path):
    gesture_path = os.path.join(base_path, 'Motion')
    audio_path = os.path.join(base_path, 'Audio')
    text_path = os.path.join(base_path, 'Transcripts')
    out_path = os.path.join(base_path, 'lmdb')
    if not os.path.exists(out_path):
        os.makedirs(out_path)

    map_size = 1024 * 20  # in MB
    map_size <<= 20  # in B
    db = [
        lmdb.open(os.path.join(out_path, 'lmdb_train'), map_size=map_size),
        lmdb.open(os.path.join(out_path, 'lmdb_test'), map_size=map_size)
    ]

    # delete existing files
    for i in range(2):
        with db[i].begin(write=True) as txn:
            txn.drop(db[i].open_db())

    all_poses = []
    bvh_files = sorted(glob.glob(gesture_path + "/*.bvh"))
    for v_i, bvh_file in enumerate(bvh_files):
        name = os.path.split(bvh_file)[1][:-4]
        print(name)

        # load skeletons and subtitles
        poses, poses_mirror = process_bvh(bvh_file)
        subtitle = SubtitleWrapper(os.path.join(text_path,
                                                name + '.json')).get()

        # load audio
        audio_raw, audio_sr = librosa.load(os.path.join(
            audio_path, '{}.wav'.format(name)),
                                           mono=True,
                                           sr=16000,
                                           res_type='kaiser_fast')

        # process
        clips = [
            {
                'vid': name,
                'clips': []
            },  # train
            {
                'vid': name,
                'clips': []
            }
        ]  # validation

        # split
        if v_i == 0:
            dataset_idx = 1  # validation
        else:
            dataset_idx = 0  # train

        # word preprocessing
        word_list = []
        for wi in range(len(subtitle)):
            word_s = float(subtitle[wi]['start_time'][:-1])
            word_e = float(subtitle[wi]['end_time'][:-1])
            word = subtitle[wi]['word']

            word = normalize_string(word)
            if len(word) > 0:
                word_list.append([word, word_s, word_e])

        # save subtitles and skeletons
        poses = np.asarray(poses, dtype=np.float16)
        clips[dataset_idx]['clips'].append({
            'words': word_list,
            'poses': poses,
            'audio_raw': audio_raw
        })
        poses_mirror = np.asarray(poses_mirror, dtype=np.float16)
        clips[dataset_idx]['clips'].append({
            'words': word_list,
            'poses': poses_mirror,
            'audio_raw': audio_raw
        })

        # write to db
        for i in range(2):
            with db[i].begin(write=True) as txn:
                if len(clips[i]['clips']) > 0:
                    k = '{:010}'.format(v_i).encode('ascii')
                    v = pyarrow.serialize(clips[i]).to_buffer()
                    txn.put(k, v)

        all_poses.append(poses)

    # close db
    for i in range(2):
        db[i].sync()
        db[i].close()

    # calculate data mean
    all_poses = np.vstack(all_poses)
    pose_mean = np.mean(all_poses, axis=0)
    pose_std = np.std(all_poses, axis=0)

    print('data mean/std')
    print(str(["{:0.5f}".format(e) for e in pose_mean]).replace("'", ""))
    print(str(["{:0.5f}".format(e) for e in pose_std]).replace("'", ""))
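
Reading an entry back out of these LMDB databases is not part of the example; a minimal sketch (key format and directory layout assumed from the code above) might look like:

def read_lmdb_clip(lmdb_dir, video_index):
    # Open the database read-only and fetch one serialized clip entry.
    env = lmdb.open(lmdb_dir, readonly=True, lock=False)
    with env.begin(write=False) as txn:
        key = '{:010}'.format(video_index).encode('ascii')
        buf = txn.get(key)
    env.close()
    if buf is None:
        return None
    # Buffers written with pyarrow.serialize() round-trip via deserialize().
    return pyarrow.deserialize(buf)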
Example #35
def serialize(data):
	buf = pa.serialize(data).to_buffer()
	return buf
Example #36
def benchmark(procnum, send_end):
    resnet_url = "http://54.87.17.51:1337/resnet101-app/predict"
    inception_url = "http://3.235.164.133:1337/inceptionv3-app/predict"
    predict_url = "http://34.232.48.232:1337/predict-app/predict"

    headers = {"Content-type": "application/json"}
    latencies = []
    post_serial_latencies = []
    resnet_latencies = []
    incept_latencies = []
    # pred_latencies = []
    y = os.listdir("imagenet_sample/imagenet/")
    x = random.choices(y, k=1000)
    count = 0
    for filename in x:
        start = time.time()
        #print(filename)
        # Creating image input
        req_json = json.dumps({
            "input":
            base64.b64encode(
                open("imagenet_sample/imagenet/" + filename,
                     "rb").read()).decode()
        })
        serial_start = time.time()
        #Calling resnet
        r = requests.post(resnet_url, headers=headers, data=req_json)
        resnet_output = r.json()['output']
        if r.json()['default']:
            print("ERROR", os.getpid())
            return
        resnet_end = time.time()

        #Calling inception
        incept_start = time.time()
        input_bytes = pa.serialize([
            np.asarray(
                Image.open("imagenet_sample/imagenet/" +
                           filename).convert("RGB")), resnet_output[1]
        ]).to_buffer().to_pybytes()
        req_json = json.dumps(
            {"input": base64.b64encode(input_bytes).decode()})
        r2 = requests.post(inception_url, headers=headers, data=req_json)
        inception_output = r2.json()['output']
        incept_end = time.time()

        #calling predict
        predict_input = [
            resnet_output[1], inception_output[1],
            float(resnet_output[0]),
            float(inception_output[1])
        ]

        req_json = json.dumps({"input": predict_input})
        r3 = requests.post(predict_url, headers=headers, data=req_json)

        end = time.time()
        incept_end = end
        latency = (end - start)
        #print("'%s', %f ms" % (r.text, latency))
        latencies.append(latency)
        resnet_latencies.append(resnet_end - serial_start)
        incept_latencies.append(incept_end - resnet_end)
        post_serial_latencies.append(end - serial_start)
    send_end.send([
        latencies, post_serial_latencies, count, resnet_latencies,
        incept_latencies
    ])
Example #37
def test_serialize_to_buffer():
    for nthreads in [1, 4]:
        for value in COMPLEX_OBJECTS:
            buf = pa.serialize(value).to_buffer(nthreads=nthreads)
            result = pa.deserialize(buf)
            assert_equal(value, result)
Example #38
def test_serialization_determinism():
    for obj in COMPLEX_OBJECTS:
        buf1 = pa.serialize(obj).to_buffer()
        buf2 = pa.serialize(obj).to_buffer()
        assert buf1.to_pybytes() == buf2.to_pybytes()
Example #39
def test_serialization_determinism():
    for obj in COMPLEX_OBJECTS:
        buf1 = pa.serialize(obj).to_buffer()
        buf2 = pa.serialize(obj).to_buffer()
        assert buf1.to_pybytes() == buf2.to_pybytes()
Example #40
    def _sample_from_clip(self, vid, clip):
        clip_skeleton = clip['skeletons_3d']
        clip_audio = clip['audio_feat']
        clip_audio_raw = clip['audio_raw']
        clip_word_list = clip['words']
        clip_s_f, clip_e_f = clip['start_frame_no'], clip['end_frame_no']
        clip_s_t, clip_e_t = clip['start_time'], clip['end_time']

        n_filtered_out = defaultdict(int)

        # skeleton resampling
        clip_skeleton = utils.data_utils.resample_pose_seq(
            clip_skeleton, clip_e_t - clip_s_t, self.skeleton_resampling_fps)

        # divide
        aux_info = []
        sample_skeletons_list = []
        sample_words_list = []
        sample_audio_list = []
        sample_spectrogram_list = []

        num_subdivision = math.floor(
            (len(clip_skeleton) - self.n_poses) /
            self.subdivision_stride) + 1  # floor((K - (N+M)) / S) + 1
        expected_audio_length = utils.data_utils.calc_spectrogram_length_from_motion_length(
            len(clip_skeleton), self.skeleton_resampling_fps)
        assert abs(expected_audio_length - clip_audio.shape[1]
                   ) <= 5, 'audio and skeleton lengths are different'

        for i in range(num_subdivision):
            start_idx = i * self.subdivision_stride
            fin_idx = start_idx + self.n_poses

            sample_skeletons = clip_skeleton[start_idx:fin_idx]
            subdivision_start_time = clip_s_t + start_idx / self.skeleton_resampling_fps
            subdivision_end_time = clip_s_t + fin_idx / self.skeleton_resampling_fps
            sample_words = self.get_words_in_time_range(
                word_list=clip_word_list,
                start_time=subdivision_start_time,
                end_time=subdivision_end_time)

            # spectrogram
            audio_start = math.floor(start_idx / len(clip_skeleton) *
                                     clip_audio.shape[1])
            audio_end = audio_start + self.spectrogram_sample_length
            if audio_end > clip_audio.shape[
                    1]:  # correct size mismatch between poses and audio
                # logging.info('expanding audio array, audio start={}, end={}, clip_length={}'.format(
                #     audio_start, audio_end, clip_audio.shape[1]))
                n_padding = audio_end - clip_audio.shape[1]
                padded_data = np.pad(clip_audio, ((0, 0), (0, n_padding)),
                                     mode='symmetric')
                sample_spectrogram = padded_data[:, audio_start:audio_end]
            else:
                sample_spectrogram = clip_audio[:, audio_start:audio_end]

            # raw audio
            audio_start = math.floor(start_idx / len(clip_skeleton) *
                                     len(clip_audio_raw))
            audio_end = audio_start + self.audio_sample_length
            if audio_end > len(
                    clip_audio_raw
            ):  # correct size mismatch between poses and audio
                # logging.info('expanding audio array, audio start={}, end={}, clip_length={}'.format(
                #     audio_start, audio_end, len(clip_audio_raw)))
                n_padding = audio_end - len(clip_audio_raw)
                padded_data = np.pad(clip_audio_raw, (0, n_padding),
                                     mode='symmetric')
                sample_audio = padded_data[audio_start:audio_end]
            else:
                sample_audio = clip_audio_raw[audio_start:audio_end]

            if len(sample_words) >= 2:
                # filtering motion skeleton data
                sample_skeletons, filtering_message = MotionPreprocessor(
                    sample_skeletons, self.mean_pose).get()
                is_correct_motion = (sample_skeletons != [])
                motion_info = {
                    'vid': vid,
                    'start_frame_no': clip_s_f + start_idx,
                    'end_frame_no': clip_s_f + fin_idx,
                    'start_time': subdivision_start_time,
                    'end_time': subdivision_end_time,
                    'is_correct_motion': is_correct_motion,
                    'filtering_message': filtering_message
                }

                if is_correct_motion or self.disable_filtering:
                    sample_skeletons_list.append(sample_skeletons)
                    sample_words_list.append(sample_words)
                    sample_audio_list.append(sample_audio)
                    sample_spectrogram_list.append(sample_spectrogram)
                    aux_info.append(motion_info)
                else:
                    n_filtered_out[filtering_message] += 1

        if len(sample_skeletons_list) > 0:
            with self.dst_lmdb_env.begin(write=True) as txn:
                for words, poses, audio, spectrogram, aux in zip(
                        sample_words_list, sample_skeletons_list,
                        sample_audio_list, sample_spectrogram_list, aux_info):
                    # preprocessing for poses
                    poses = np.asarray(poses)
                    dir_vec = utils.data_utils.convert_pose_seq_to_dir_vec(
                        poses)
                    normalized_dir_vec = self.normalize_dir_vec(
                        dir_vec, self.mean_dir_vec)

                    # save
                    k = '{:010}'.format(self.n_out_samples).encode('ascii')
                    v = [
                        words, poses, normalized_dir_vec, audio, spectrogram,
                        aux
                    ]
                    v = pyarrow.serialize(v).to_buffer()
                    txn.put(k, v)
                    self.n_out_samples += 1

        return n_filtered_out
Example #41
        train_wav_files, test_wav_files = train_test_split(wav_files,
                                                           train_size=split,
                                                           random_state=1234)
        if 'train' in mode:
            wav_files = train_wav_files
        elif 'test' in mode:
            wav_files = test_wav_files
        else:
            raise NotImplementedError(
                'Other modes are not implemented in split mode! '
                '(mode: %s, split: %.4f)' % (mode, split))
    return wav_files


# serialize data function
serialize_data = lambda arr: pa.serialize(arr).to_buffer().to_pybytes()
deserialize_data = lambda bin_data: pa.deserialize(bin_data)


def load_arrow_file(file_path):
    with open(file_path, 'rb') as rb:
        return deserialize_data(rb.read())
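
A write-side counterpart to load_arrow_file is implied but not shown; a small sketch reusing serialize_data would be:

def save_arrow_file(arr, file_path):
    # Persist the pyarrow-serialized bytes produced by serialize_data.
    with open(file_path, 'wb') as wb:
        wb.write(serialize_data(arr))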


def normalize_0_1(values, max, min):
    normalized = np.clip((values - min) / (max - min), 0, 1)
    return normalized


def denormalize_0_1(normalized, max, min):
    values = np.clip(normalized, 0, 1) * (max - min) + min