Example #1
def test_encode_non_scalar_type_is_passed(non_scalar_value):
    codec = ScalarCodec(FloatType())
    field = UnischemaField(name='field_float',
                           numpy_dtype=np.float32,
                           shape=(),
                           codec=codec,
                           nullable=False)
    with pytest.raises(TypeError, match='Expected a scalar'):
        codec.encode(field, non_scalar_value)
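
For contrast, here is a minimal sketch of the passing path (an illustrative addition, assuming the same petastorm and pyspark imports used throughout these examples): a true scalar encodes without error, and ScalarCodec returns a plain Python value, as Example #13 and Example #20 below also verify.

import numpy as np
from pyspark.sql.types import FloatType
from petastorm.codecs import ScalarCodec
from petastorm.unischema import UnischemaField

codec = ScalarCodec(FloatType())
field = UnischemaField(name='field_float', numpy_dtype=np.float32, shape=(),
                       codec=codec, nullable=False)

encoded = codec.encode(field, np.float32(0.5))  # a proper scalar, so no TypeError
assert isinstance(encoded, float)               # encoded as a plain Python float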
Example #2
def test_bad_encoded_data_shape():
    codec = ScalarCodec(IntegerType())
    field = UnischemaField(name='field_int',
                           numpy_dtype=np.int32,
                           shape=(),
                           codec=codec,
                           nullable=False)
    with pytest.raises(TypeError):
        codec.decode(field, codec.encode(field, np.asarray([10, 10])))
Example #3
def test_bad_unischema_field_shape():
    codec = ScalarCodec(IntegerType())
    field = UnischemaField(name='field_int',
                           numpy_dtype=np.int32,
                           shape=(1, ),
                           codec=codec,
                           nullable=False)
    with pytest.raises(ValueError, match='must be an empty tuple'):
        codec.encode(field, np.int32(1))
Example #4
def test_as_spark_schema_unspecified_codec_type_unknown_scalar_type_raises():
    """We have a limited list of scalar types we can automatically map from numpy (+Decimal) types to spark types.
    Make sure that a ValueError is raised if an unknown type is used."""
    TestSchema = Unischema('TestSchema', [
        UnischemaField('int_vector_unspecified_codec', object, ()),
    ])

    with pytest.raises(ValueError, match='Was not able to map type'):
        TestSchema.as_spark_schema()
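
For comparison, a minimal sketch (an illustrative addition, not part of the original suite) of the case where the automatic mapping succeeds: numpy types that petastorm knows how to map, such as np.int32 and np.string_, need no explicit codec. Example #10 below confirms the implicit string mapping.

import numpy as np
from petastorm.unischema import Unischema, UnischemaField

KnownSchema = Unischema('KnownSchema', [
    UnischemaField('int_field', np.int32, ()),       # known scalar type, no codec needed
    UnischemaField('string_field', np.string_, ()),  # maps implicitly to StringType
])
KnownSchema.as_spark_schema()  # succeeds; no ValueError is raised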
Example #5
def test_bad_shape():
    codec = CompressedImageCodec('png')
    field = UnischemaField(name='field_image',
                           numpy_dtype=np.uint8,
                           shape=(10, 20),
                           codec=codec,
                           nullable=False)
    with pytest.raises(ValueError, match='Unexpected dimensions'):
        codec.encode(field, np.zeros((100, 200), dtype=np.uint8))
Example #6
def test_nested_value():
    codec = NoopCodec(ArrayType(ArrayType(StringType())))
    field = UnischemaField(name='field_string',
                           numpy_dtype=np.string_,
                           shape=(None, None),
                           codec=codec,
                           nullable=False)
    nested_array = [['a', 'b'], ['c'], ['d']]
    assert codec.decode(field, codec.encode(field,
                                            nested_array)) == nested_array
Example #7
def test_add_field_transform():
    one_added = transform_schema(
        TestSchema,
        TransformSpec(lambda x: x,
                      edit_fields=[
                          UnischemaField('double2', np.float64, (), None,
                                         False)
                      ]))
    assert set(
        one_added.fields.keys()) == {'string', 'double', 'double2', 'int'}
Example #8
def test_encode_scalar_bool():
    codec = ScalarCodec(BooleanType())
    field = UnischemaField(name='field_bool', numpy_dtype=np.bool_, shape=(), codec=codec, nullable=False)

    encoded = codec.encode(field, np.bool_(True))
    assert isinstance(encoded, bool)
    assert encoded

    encoded = codec.encode(field, np.bool_(False))
    assert not encoded
Example #9
    def test_get_petastorm_column(self):
        col_name = 'frame_id'
        col = DataFrameColumn(col_name, ColumnType.INTEGER, False)
        petastorm_col = UnischemaField(col_name, np.int32, (),
                                       ScalarCodec(IntegerType()), False)
        self.assertEqual(SchemaUtils.get_petastorm_column(col), petastorm_col)

        col = DataFrameColumn(col_name, ColumnType.FLOAT, True)
        petastorm_col = UnischemaField(col_name, np.float64, (),
                                       ScalarCodec(FloatType()), True)
        self.assertEqual(SchemaUtils.get_petastorm_column(col), petastorm_col)

        col = DataFrameColumn(col_name, ColumnType.TEXT, False)
        petastorm_col = UnischemaField(col_name, np.str_, (),
                                       ScalarCodec(StringType()), False)
        self.assertEqual(SchemaUtils.get_petastorm_column(col), petastorm_col)

        col = DataFrameColumn(col_name, None, True, [10, 10])
        self.assertEqual(SchemaUtils.get_petastorm_column(col), None)
Example #10
def test_as_spark_schema():
    """Try using 'as_spark_schema' function"""
    TestSchema = Unischema('TestSchema', [
        UnischemaField('int_field', np.int8, (), ScalarCodec(IntegerType()), False),
        UnischemaField('string_field', np.string_, (), ScalarCodec(StringType()), False),
        UnischemaField('string_field_implicit', np.string_, ()),
    ])

    spark_schema = TestSchema.as_spark_schema()
    assert spark_schema.fields[0].name == 'int_field'

    assert spark_schema.fields[1].name == 'string_field'
    assert spark_schema.fields[1].dataType == StringType()

    assert spark_schema.fields[2].name == 'string_field_implicit'
    assert spark_schema.fields[2].dataType == StringType()

    assert TestSchema.fields['int_field'].name == 'int_field'
    assert TestSchema.fields['string_field'].name == 'string_field'
Example #11
def test_invalid_schema_field(synthetic_dataset, reader_factory):
    # Let's assume we are selecting columns using a schema which is different from the one
    # stored in the dataset. We expect to get a reasonable error message.
    BogusSchema = Unischema('BogusSchema', [
        UnischemaField('partition_key', np.string_,
                       (), ScalarCodec(StringType()), False),
        UnischemaField('id', np.int64, (), ScalarCodec(LongType()), False),
        UnischemaField('bogus_key', np.int32,
                       (), ScalarCodec(ShortType()), False)
    ])

    expected_values = {'bogus_key': 11, 'id': 1}
    with pytest.raises(ValueError) as e:
        reader_factory(synthetic_dataset.url,
                       schema_fields=BogusSchema.fields.values(),
                       shuffle_row_groups=False,
                       predicate=EqualPredicate(expected_values))

    assert 'bogus_key' in str(e)
Example #12
def test_numeric_types(spark_numpy_types):
    spark_type, numpy_type = spark_numpy_types

    codec = ScalarCodec(spark_type())
    field = UnischemaField(name='field_int', numpy_dtype=numpy_type, shape=(), codec=codec, nullable=False)

    min_val, max_val = np.iinfo(numpy_type).min, np.iinfo(numpy_type).max

    assert codec.decode(field, codec.encode(field, numpy_type(min_val))) == min_val
    assert codec.decode(field, codec.encode(field, numpy_type(max_val))) == max_val
Example #13
def test_encode_scalar_int():
    codec = ScalarCodec(IntegerType())
    field = UnischemaField(name='field_int',
                           numpy_dtype=np.int32,
                           shape=(),
                           codec=codec,
                           nullable=False)
    encoded = codec.encode(field, np.int32(42))
    assert isinstance(encoded, int)
    assert 42 == encoded
Example #14
    def test_bad_dtype(self):
        codec = CompressedImageCodec('png')
        field = UnischemaField(name='field_image',
                               numpy_dtype=np.uint8,
                               shape=(10, 20),
                               codec=codec,
                               nullable=False)
        with self.assertRaises(ValueError) as e:
            codec.encode(field, np.zeros((100, 200), dtype=np.uint16))
        self.assertTrue('Unexpected type' in str(e.exception))
Example #15
def test_unicode():
    codec = ScalarCodec(StringType())
    field = UnischemaField(name='field_string',
                           numpy_dtype=np.unicode_,
                           shape=(),
                           codec=codec,
                           nullable=False)

    assert codec.decode(field, codec.encode(field, 'abc')) == 'abc'
    assert codec.decode(field, codec.encode(field, '')) == ''
Example #16
def test_scalar_codec_decimal():
    codec = ScalarCodec(DecimalType(4, 3))
    field = UnischemaField(name='field_decimal',
                           numpy_dtype=Decimal,
                           shape=(),
                           codec=codec,
                           nullable=False)

    value = Decimal('123.4567')
    assert codec.decode(field, codec.encode(field, value)) == value
Example #17
def test_predicate_on_partitioned_dataset(tmpdir):
    """
    Generates a partitioned dataset and ensures that readers evaluate the type of the partition
    column according to the type given in the Unischema.
    """
    TestSchema = Unischema('TestSchema', [
        UnischemaField('id', np.int32, (), ScalarCodec(IntegerType()), False),
        UnischemaField('test_field', np.int32,
                       (), ScalarCodec(IntegerType()), False),
    ])

    def test_row_generator(x):
        """Returns a single entry in the generated dataset."""
        return {'id': x, 'test_field': x * x}

    rowgroup_size_mb = 256
    dataset_url = "file://{0}/partitioned_test_dataset".format(tmpdir)

    spark = SparkSession.builder.config('spark.driver.memory',
                                        '2g').master('local[2]').getOrCreate()
    sc = spark.sparkContext

    rows_count = 10
    with materialize_dataset(spark, dataset_url, TestSchema, rowgroup_size_mb):

        rows_rdd = sc.parallelize(range(rows_count))\
            .map(test_row_generator)\
            .map(lambda x: dict_to_spark_row(TestSchema, x))

        spark.createDataFrame(rows_rdd, TestSchema.as_spark_schema()) \
            .write \
            .partitionBy('id') \
            .parquet(dataset_url)

    with make_reader(dataset_url,
                     predicate=in_lambda(['id'], lambda x: x == 3)) as reader:
        assert next(reader).id == 3
    with make_reader(dataset_url,
                     predicate=in_lambda(['id'],
                                         lambda x: x == '3')) as reader:
        with pytest.raises(StopIteration):
            # Predicate should have selected none, so a StopIteration should be raised.
            next(reader)
Example #18
def test_decode_numpy_scalar_when_codec_is_none():
    """Decoding a row that has a field with the codec set to None. The type should be deduced automatically
    from UnischemaField's numpy_dtype attribute"""

    MatrixSchema = Unischema('TestSchema',
                             [UnischemaField('scalar', np.float64, ())])
    row = {'scalar': 42.0}
    decoded_value = decode_row(row, MatrixSchema)['scalar']
    assert decoded_value == 42
    assert isinstance(decoded_value, np.float64)
Example #19
def test_transform_spec_support_return_tensor(scalar_dataset, reader_factory):
    field1 = UnischemaField(name='abc', shape=(2, 3), numpy_dtype=np.float32)

    with pytest.raises(ValueError, match='field abc must be numpy array type'):
        ArrowReaderWorker._check_shape_and_ravel('xyz', field1)

    with pytest.raises(ValueError, match='field abc must be the shape'):
        ArrowReaderWorker._check_shape_and_ravel(np.zeros((2, 5)), field1)

    with pytest.raises(ValueError, match='field abc error: only support row major multi-dimensional array'):
        ArrowReaderWorker._check_shape_and_ravel(np.zeros((2, 3), order='F'), field1)

    assert (6,) == ArrowReaderWorker._check_shape_and_ravel(np.zeros((2, 3)), field1).shape

    for partial_shape in [(2, None), (None,), (None, None)]:
        field_with_unknown_dim = UnischemaField(name='abc', shape=partial_shape, numpy_dtype=np.float32)
        with pytest.raises(ValueError, match='All dimensions of a shape.*must be constant'):
            ArrowReaderWorker._check_shape_and_ravel(np.zeros((2, 3), order='F'), field_with_unknown_dim)

    def preproc_fn1(x):
        return pd.DataFrame({
            'tensor_col_1': x['id'].map(lambda _: np.random.rand(2, 3)),
            'tensor_col_2': x['id'].map(lambda _: np.random.rand(3, 4, 5)),
        })

    edit_fields = [
        ('tensor_col_1', np.float32, (2, 3), False),
        ('tensor_col_2', np.float32, (3, 4, 5), False),
    ]

    # This spec removes all input columns and returns two new columns:
    # 'tensor_col_1' with shape (2, 3) and 'tensor_col_2' with shape (3, 4, 5)
    spec1 = TransformSpec(
        preproc_fn1,
        edit_fields=edit_fields,
        removed_fields=list(scalar_dataset.data[0].keys())
    )

    with reader_factory(scalar_dataset.url, transform_spec=spec1) as reader:
        sample = next(reader)._asdict()
        assert len(sample) == 2
        assert (2, 3) == sample['tensor_col_1'].shape[1:] and \
               (3, 4, 5) == sample['tensor_col_2'].shape[1:]
Example #20
def test_encode_scalar_float():
    codec = ScalarCodec(FloatType())
    expected = np.random.random(()).astype(np.float64)
    field = UnischemaField(name='field_float',
                           numpy_dtype=np.float32,
                           shape=(),
                           codec=codec,
                           nullable=False)
    encoded = codec.encode(field, expected)
    assert isinstance(encoded, float)
    assert expected == encoded
Example #21
def test_compressed_ndarray_codec():
    SHAPE = (10, 20, 30)
    expected = np.random.rand(*SHAPE).astype(dtype=np.int32)
    codec = CompressedNdarrayCodec()
    field = UnischemaField(name='test_name',
                           numpy_dtype=np.int32,
                           shape=SHAPE,
                           codec=CompressedNdarrayCodec(),
                           nullable=False)
    np.testing.assert_equal(codec.decode(field, codec.encode(field, expected)),
                            expected)
Example #22
def test_match_unischema_fields_legacy_warning():
    TestSchema = Unischema('TestSchema', [
        UnischemaField('int32', np.int32, (), None, False),
        UnischemaField('uint8', np.uint8, (), None, False),
        UnischemaField('uint16', np.uint16, (), None, False),
    ])

    # Check that no warnings are shown if the legacy and the new way of filtering produce the same results.
    with pytest.warns(None) as unexpected_warnings:
        match_unischema_fields(TestSchema, ['uint8'])
    assert not unexpected_warnings

    # uint8 and uint16 would have been matched using the old method, but not the new one
    with pytest.warns(UserWarning, match=r'schema_fields behavior has changed.*uint16, uint8'):
        assert match_unischema_fields(TestSchema, ['uint']) == []

    # Now, all fields will be matched, but in different order (legacy vs current). Make sure we don't issue a warning.
    with pytest.warns(None) as unexpected_warnings:
        match_unischema_fields(TestSchema, ['int', 'uint8', 'uint16', 'int32'])
    assert not unexpected_warnings
Example #23
    def test_scalar_codec_unicode(self):
        codec = ScalarCodec(StringType())
        field = UnischemaField(name='field_string',
                               numpy_dtype=np.unicode_,
                               shape=(),
                               codec=codec,
                               nullable=False)

        self.assertEqual(codec.decode(field, codec.encode(field, 'abc')),
                         'abc')
        self.assertEqual(codec.decode(field, codec.encode(field, '')), '')
Example #24
def test_encode_scalar_string():
    codec = ScalarCodec(StringType())
    expected = 'surprise'
    field = UnischemaField(name='field_string',
                           numpy_dtype=np.unicode_,
                           shape=(),
                           codec=codec,
                           nullable=False)
    encoded = codec.encode(field, expected)
    assert isinstance(encoded, str)
    assert expected == encoded
Example #25
def test_decode_numpy_scalar_with_explicit_scalar_codec():
    """Decoding a row that has a field with the codec set explicitly"""

    MatrixSchema = Unischema('TestSchema', [
        UnischemaField('scalar', np.float64,
                       (), ScalarCodec(DoubleType()), False)
    ])
    row = {'scalar': 42.0}
    decoded_value = decode_row(row, MatrixSchema)['scalar']
    assert decoded_value == 42
    assert isinstance(decoded_value, np.float64)
Example #26
    def test_get_petastorm_column_ndarray(self):
        expected_type = [np.int8, np.uint8, np.int16, np.int32, np.int64,
                         np.unicode_, np.bool_, np.float32, np.float64,
                         Decimal, np.str_, np.datetime64]
        col_name = 'frame_id'
        for array_type, np_type in zip(NdArrayType, expected_type):
            col = DataFrameColumn(col_name, ColumnType.NDARRAY, True,
                                  array_type, [10, 10])
            petastorm_col = UnischemaField(col_name, np_type, [10, 10],
                                           NdarrayCodec(), True)
            self.assertEqual(SchemaUtils.get_petastorm_column(col),
                             petastorm_col)
Example #27
    @classmethod
    def setUpClass(cls):
        cls._TestField1a = UnischemaField('random', np.string_, (), ScalarCodec(StringType()), False)
        cls._TestField1b = UnischemaField('random', np.string_, (), ScalarCodec(StringType()), False)
        cls._TestField1c = UnischemaField('Random', np.string_, (), ScalarCodec(StringType()), False)
        cls._TestField2a = UnischemaField('id', np.int32, (), ScalarCodec(ShortType()), False)
        cls._TestField2b = UnischemaField('id', np.int32, (), ScalarCodec(ShortType()), False)
        cls._TestField2c = UnischemaField('ID', np.int32, (), ScalarCodec(ShortType()), False)
Example #28
def _test_scalar_type(spark_type, numpy_type, bits):
    codec = ScalarCodec(spark_type())
    field = UnischemaField(name='field_int',
                           numpy_dtype=numpy_type,
                           shape=(),
                           codec=codec,
                           nullable=False)

    min_val, max_val = -2**(bits - 1), 2**(bits - 1) - 1
    assert codec.decode(field, codec.encode(field,
                                            numpy_type(min_val))) == min_val
    assert codec.decode(field, codec.encode(field,
                                            numpy_type(max_val))) == max_val
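
A plausible set of callers for the parametrized helper above (hypothetical test names, not from the original suite): the Spark type, the numpy dtype, and the bit width must agree for the min/max round trip to hold.

import numpy as np
from pyspark.sql.types import ShortType, IntegerType, LongType

def test_short_scalar():
    _test_scalar_type(ShortType, np.int16, bits=16)

def test_integer_scalar():
    _test_scalar_type(IntegerType, np.int32, bits=32)

def test_long_scalar():
    _test_scalar_type(LongType, np.int64, bits=64)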
Example #29
def test_ndarray_codec(codec_factory):
    SHAPE = (10, 20, 3)
    for dtype in NUMERIC_DTYPES:
        expected = np.random.rand(*SHAPE).astype(dtype=dtype)
        codec = codec_factory()
        field = UnischemaField(name='test_name',
                               numpy_dtype=dtype,
                               shape=SHAPE,
                               codec=codec,
                               nullable=False)
        actual = codec.decode(field, codec.encode(field, expected))
        np.testing.assert_equal(actual, expected)
        assert expected.dtype == actual.dtype
Example #30
def test_use_persisted_codec_and_not_provided_by_user(synthetic_dataset,
                                                      reader_factory):
    """In order to start using new codec for some field while maintain the ability to read old datasets that were
    written using an old codec, we need to make sure we are using stored UnischemaField.codec object (that contains
    an old codec/shape)."""
    new_unischema_instance = UnischemaField('matrix_uint16', np.uint16,
                                            (2, 3, 4),
                                            CompressedImageCodec('png'), False)

    with reader_factory(synthetic_dataset.url,
                        schema_fields=[new_unischema_instance]) as reader:
        row = next(reader)
    assert row.matrix_uint16.shape == (32, 16, 3)