Example #1
def test_encode_non_scalar_type_is_passed(non_scalar_value):
    codec = ScalarCodec(FloatType())
    field = UnischemaField(name='field_float',
                           numpy_dtype=np.float32,
                           shape=(),
                           codec=codec,
                           nullable=False)
    with pytest.raises(TypeError, match='Expected a scalar'):
        codec.encode(field, non_scalar_value)
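
For contrast, here is a minimal sketch of the passing path (an illustrative addition, assuming the same petastorm and pyspark imports used throughout these examples): a true scalar encodes without error, and ScalarCodec returns a plain Python value, as Example #13 and Example #20 below also verify.

import numpy as np
from pyspark.sql.types import FloatType
from petastorm.codecs import ScalarCodec
from petastorm.unischema import UnischemaField

codec = ScalarCodec(FloatType())
field = UnischemaField(name='field_float', numpy_dtype=np.float32, shape=(),
                       codec=codec, nullable=False)

encoded = codec.encode(field, np.float32(0.5))  # a proper scalar, so no TypeError
assert isinstance(encoded, float)               # encoded as a plain Python float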
Example #2
def test_bad_encoded_data_shape():
    codec = ScalarCodec(IntegerType())
    field = UnischemaField(name='field_int',
                           numpy_dtype=np.int32,
                           shape=(),
                           codec=codec,
                           nullable=False)
    with pytest.raises(TypeError):
        codec.decode(field, codec.encode(field, np.asarray([10, 10])))
Example #3
def test_bad_unischema_field_shape():
    codec = ScalarCodec(IntegerType())
    field = UnischemaField(name='field_int',
                           numpy_dtype=np.int32,
                           shape=(1, ),
                           codec=codec,
                           nullable=False)
    with pytest.raises(ValueError, match='must be an empty tuple'):
        codec.encode(field, np.int32(1))
Example #4
def test_as_spark_schema_unspecified_codec_type_unknown_scalar_type_raises():
    """We have a limited list of scalar types we can automatically map from numpy (+Decimal) types to spark types.
    Make sure that a ValueError is raised if an unknown type is used."""
    TestSchema = Unischema('TestSchema', [
        UnischemaField('int_vector_unspecified_codec', object, ()),
    ])

    with pytest.raises(ValueError, match='Was not able to map type'):
        TestSchema.as_spark_schema()
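
For comparison, a minimal sketch (an illustrative addition, not part of the original suite) of the case where the automatic mapping succeeds: numpy types that petastorm knows how to map, such as np.int32 and np.string_, need no explicit codec. Example #10 below confirms the implicit string mapping.

import numpy as np
from petastorm.unischema import Unischema, UnischemaField

KnownSchema = Unischema('KnownSchema', [
    UnischemaField('int_field', np.int32, ()),       # known scalar type, no codec needed
    UnischemaField('string_field', np.string_, ()),  # maps implicitly to StringType
])
KnownSchema.as_spark_schema()  # succeeds; no ValueError is raised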
Example #5
def test_bad_shape():
    codec = CompressedImageCodec('png')
    field = UnischemaField(name='field_image',
                           numpy_dtype=np.uint8,
                           shape=(10, 20),
                           codec=codec,
                           nullable=False)
    with pytest.raises(ValueError, match='Unexpected dimensions'):
        codec.encode(field, np.zeros((100, 200), dtype=np.uint8))
Example #6
def test_nested_value():
    codec = NoopCodec(ArrayType(ArrayType(StringType())))
    field = UnischemaField(name='field_string',
                           numpy_dtype=np.string_,
                           shape=(None, None),
                           codec=codec,
                           nullable=False)
    nested_array = [['a', 'b'], ['c'], ['d']]
    assert codec.decode(field, codec.encode(field,
                                            nested_array)) == nested_array
Example #7
def test_add_field_transform():
    one_added = transform_schema(
        TestSchema,
        TransformSpec(lambda x: x,
                      edit_fields=[
                          UnischemaField('double2', np.float64, (), None,
                                         False)
                      ]))
    assert set(
        one_added.fields.keys()) == {'string', 'double', 'double2', 'int'}
Example #8
def test_encode_scalar_bool():
    codec = ScalarCodec(BooleanType())
    field = UnischemaField(name='field_bool', numpy_dtype=np.bool_, shape=(), codec=codec, nullable=False)

    encoded = codec.encode(field, np.bool_(True))
    assert isinstance(encoded, bool)
    assert encoded

    encoded = codec.encode(field, np.bool_(False))
    assert not encoded
Example #9
    def test_get_petastorm_column(self):
        col_name = 'frame_id'
        col = DataFrameColumn(col_name, ColumnType.INTEGER, False)
        petastorm_col = UnischemaField(col_name, np.int32, (),
                                       ScalarCodec(IntegerType()), False)
        self.assertEqual(SchemaUtils.get_petastorm_column(col), petastorm_col)

        col = DataFrameColumn(col_name, ColumnType.FLOAT, True)
        petastorm_col = UnischemaField(col_name, np.float64, (),
                                       ScalarCodec(FloatType()), True)
        self.assertEqual(SchemaUtils.get_petastorm_column(col), petastorm_col)

        col = DataFrameColumn(col_name, ColumnType.TEXT, False)
        petastorm_col = UnischemaField(col_name, np.str_, (),
                                       ScalarCodec(StringType()), False)
        self.assertEqual(SchemaUtils.get_petastorm_column(col), petastorm_col)

        col = DataFrameColumn(col_name, None, True, [10, 10])
        self.assertEqual(SchemaUtils.get_petastorm_column(col), None)
Example #10
def test_as_spark_schema():
    """Try using 'as_spark_schema' function"""
    TestSchema = Unischema('TestSchema', [
        UnischemaField('int_field', np.int8, (), ScalarCodec(IntegerType()), False),
        UnischemaField('string_field', np.string_, (), ScalarCodec(StringType()), False),
        UnischemaField('string_field_implicit', np.string_, ()),
    ])

    spark_schema = TestSchema.as_spark_schema()
    assert spark_schema.fields[0].name == 'int_field'

    assert spark_schema.fields[1].name == 'string_field'
    assert spark_schema.fields[1].dataType == StringType()

    assert spark_schema.fields[2].name == 'string_field_implicit'
    assert spark_schema.fields[2].dataType == StringType()

    assert TestSchema.fields['int_field'].name == 'int_field'
    assert TestSchema.fields['string_field'].name == 'string_field'
Example #11
def test_invalid_schema_field(synthetic_dataset, reader_factory):
    # Let's assume we are selecting columns using a schema which is different from the one
    # stored in the dataset. We expect to get a reasonable error message.
    BogusSchema = Unischema('BogusSchema', [
        UnischemaField('partition_key', np.string_,
                       (), ScalarCodec(StringType()), False),
        UnischemaField('id', np.int64, (), ScalarCodec(LongType()), False),
        UnischemaField('bogus_key', np.int32,
                       (), ScalarCodec(ShortType()), False)
    ])

    expected_values = {'bogus_key': 11, 'id': 1}
    with pytest.raises(ValueError) as e:
        reader_factory(synthetic_dataset.url,
                       schema_fields=BogusSchema.fields.values(),
                       shuffle_row_groups=False,
                       predicate=EqualPredicate(expected_values))

    assert 'bogus_key' in str(e)
Example #12
def test_numeric_types(spark_numpy_types):
    spark_type, numpy_type = spark_numpy_types

    codec = ScalarCodec(spark_type())
    field = UnischemaField(name='field_int', numpy_dtype=numpy_type, shape=(), codec=codec, nullable=False)

    min_val, max_val = np.iinfo(numpy_type).min, np.iinfo(numpy_type).max

    assert codec.decode(field, codec.encode(field, numpy_type(min_val))) == min_val
    assert codec.decode(field, codec.encode(field, numpy_type(max_val))) == max_val
Example #13
def test_encode_scalar_int():
    codec = ScalarCodec(IntegerType())
    field = UnischemaField(name='field_int',
                           numpy_dtype=np.int32,
                           shape=(),
                           codec=codec,
                           nullable=False)
    encoded = codec.encode(field, np.int32(42))
    assert isinstance(encoded, int)
    assert 42 == encoded
Example #14
    def test_bad_dtype(self):
        codec = CompressedImageCodec('png')
        field = UnischemaField(name='field_image',
                               numpy_dtype=np.uint8,
                               shape=(10, 20),
                               codec=codec,
                               nullable=False)
        with self.assertRaises(ValueError) as e:
            codec.encode(field, np.zeros((100, 200), dtype=np.uint16))
        self.assertTrue('Unexpected type' in str(e.exception))
Example #15
def test_unicode():
    codec = ScalarCodec(StringType())
    field = UnischemaField(name='field_string',
                           numpy_dtype=np.unicode_,
                           shape=(),
                           codec=codec,
                           nullable=False)

    assert codec.decode(field, codec.encode(field, 'abc')) == 'abc'
    assert codec.decode(field, codec.encode(field, '')) == ''
Example #16
def test_scalar_codec_decimal():
    codec = ScalarCodec(DecimalType(4, 3))
    field = UnischemaField(name='field_decimal',
                           numpy_dtype=Decimal,
                           shape=(),
                           codec=codec,
                           nullable=False)

    value = Decimal('123.4567')
    assert codec.decode(field, codec.encode(field, value)) == value
Example #17
def test_predicate_on_partitioned_dataset(tmpdir):
    """
    Generates a partitioned dataset and ensures that readers evaluate the type of the partition
    column according to the type given in the Unischema.
    """
    TestSchema = Unischema('TestSchema', [
        UnischemaField('id', np.int32, (), ScalarCodec(IntegerType()), False),
        UnischemaField('test_field', np.int32,
                       (), ScalarCodec(IntegerType()), False),
    ])

    def test_row_generator(x):
        """Returns a single entry in the generated dataset."""
        return {'id': x, 'test_field': x * x}

    rowgroup_size_mb = 256
    dataset_url = "file://{0}/partitioned_test_dataset".format(tmpdir)

    spark = SparkSession.builder.config('spark.driver.memory',
                                        '2g').master('local[2]').getOrCreate()
    sc = spark.sparkContext

    rows_count = 10
    with materialize_dataset(spark, dataset_url, TestSchema, rowgroup_size_mb):

        rows_rdd = sc.parallelize(range(rows_count))\
            .map(test_row_generator)\
            .map(lambda x: dict_to_spark_row(TestSchema, x))

        spark.createDataFrame(rows_rdd, TestSchema.as_spark_schema()) \
            .write \
            .partitionBy('id') \
            .parquet(dataset_url)

    with make_reader(dataset_url,
                     predicate=in_lambda(['id'], lambda x: x == 3)) as reader:
        assert next(reader).id == 3
    with make_reader(dataset_url,
                     predicate=in_lambda(['id'],
                                         lambda x: x == '3')) as reader:
        with pytest.raises(StopIteration):
            # Predicate should have selected none, so a StopIteration should be raised.
            next(reader)
Example #18
def test_decode_numpy_scalar_when_codec_is_none():
    """Decoding a row that has a field with the codec set to None. The type should be deduced automatically
    from UnischemaField's numpy_dtype attribute"""

    MatrixSchema = Unischema('TestSchema',
                             [UnischemaField('scalar', np.float64, ())])
    row = {'scalar': 42.0}
    decoded_value = decode_row(row, MatrixSchema)['scalar']
    assert decoded_value == 42
    assert isinstance(decoded_value, np.float64)
Example #19
def test_transform_spec_support_return_tensor(scalar_dataset, reader_factory):
    field1 = UnischemaField(name='abc', shape=(2, 3), numpy_dtype=np.float32)

    with pytest.raises(ValueError, match='field abc must be numpy array type'):
        ArrowReaderWorker._check_shape_and_ravel('xyz', field1)

    with pytest.raises(ValueError, match='field abc must be the shape'):
        ArrowReaderWorker._check_shape_and_ravel(np.zeros((2, 5)), field1)

    with pytest.raises(ValueError, match='field abc error: only support row major multi-dimensional array'):
        ArrowReaderWorker._check_shape_and_ravel(np.zeros((2, 3), order='F'), field1)

    assert (6,) == ArrowReaderWorker._check_shape_and_ravel(np.zeros((2, 3)), field1).shape

    for partial_shape in [(2, None), (None,), (None, None)]:
        field_with_unknown_dim = UnischemaField(name='abc', shape=partial_shape, numpy_dtype=np.float32)
        with pytest.raises(ValueError, match='All dimensions of a shape.*must be constant'):
            ArrowReaderWorker._check_shape_and_ravel(np.zeros((2, 3), order='F'), field_with_unknown_dim)

    def preproc_fn1(x):
        return pd.DataFrame({
            'tensor_col_1': x['id'].map(lambda _: np.random.rand(2, 3)),
            'tensor_col_2': x['id'].map(lambda _: np.random.rand(3, 4, 5)),
        })

    edit_fields = [
        ('tensor_col_1', np.float32, (2, 3), False),
        ('tensor_col_2', np.float32, (3, 4, 5), False),
    ]

    # This spec removes all input columns and returns two new columns:
    # 'tensor_col_1' with shape (2, 3) and 'tensor_col_2' with shape (3, 4, 5)
    spec1 = TransformSpec(
        preproc_fn1,
        edit_fields=edit_fields,
        removed_fields=list(scalar_dataset.data[0].keys())
    )

    with reader_factory(scalar_dataset.url, transform_spec=spec1) as reader:
        sample = next(reader)._asdict()
        assert len(sample) == 2
        assert (2, 3) == sample['tensor_col_1'].shape[1:] and \
               (3, 4, 5) == sample['tensor_col_2'].shape[1:]
Example #20
def test_encode_scalar_float():
    codec = ScalarCodec(FloatType())
    expected = np.random.random(()).astype(np.float64)
    field = UnischemaField(name='field_float',
                           numpy_dtype=np.float32,
                           shape=(),
                           codec=codec,
                           nullable=False)
    encoded = codec.encode(field, expected)
    assert isinstance(encoded, float)
    assert expected == encoded
Example #21
def test_compressed_ndarray_codec():
    SHAPE = (10, 20, 30)
    expected = np.random.rand(*SHAPE).astype(dtype=np.int32)
    codec = CompressedNdarrayCodec()
    field = UnischemaField(name='test_name',
                           numpy_dtype=np.int32,
                           shape=SHAPE,
                           codec=CompressedNdarrayCodec(),
                           nullable=False)
    np.testing.assert_equal(codec.decode(field, codec.encode(field, expected)),
                            expected)
Example #22
def test_match_unischema_fields_legacy_warning():
    TestSchema = Unischema('TestSchema', [
        UnischemaField('int32', np.int32, (), None, False),
        UnischemaField('uint8', np.uint8, (), None, False),
        UnischemaField('uint16', np.uint16, (), None, False),
    ])

    # Check that no warnings are shown if the legacy and the new way of filtering produce the same results.
    with pytest.warns(None) as unexpected_warnings:
        match_unischema_fields(TestSchema, ['uint8'])
    assert not unexpected_warnings

    # uint8 and uint16 would have been matched using the old method, but not the new one
    with pytest.warns(UserWarning, match=r'schema_fields behavior has changed.*uint16, uint8'):
        assert match_unischema_fields(TestSchema, ['uint']) == []

    # Now, all fields will be matched, but in different order (legacy vs current). Make sure we don't issue a warning.
    with pytest.warns(None) as unexpected_warnings:
        match_unischema_fields(TestSchema, ['int', 'uint8', 'uint16', 'int32'])
    assert not unexpected_warnings
Example #23
    def test_scalar_codec_unicode(self):
        codec = ScalarCodec(StringType())
        field = UnischemaField(name='field_string',
                               numpy_dtype=np.unicode_,
                               shape=(),
                               codec=codec,
                               nullable=False)

        self.assertEqual(codec.decode(field, codec.encode(field, 'abc')),
                         'abc')
        self.assertEqual(codec.decode(field, codec.encode(field, '')), '')
Example #24
def test_encode_scalar_string():
    codec = ScalarCodec(StringType())
    expected = 'surprise'
    field = UnischemaField(name='field_string',
                           numpy_dtype=np.unicode_,
                           shape=(),
                           codec=codec,
                           nullable=False)
    encoded = codec.encode(field, expected)
    assert isinstance(encoded, str)
    assert expected == encoded
Example #25
def test_decode_numpy_scalar_with_explicit_scalar_codec():
    """Decoding a row that has a field with the codec set explicitly"""

    MatrixSchema = Unischema('TestSchema', [
        UnischemaField('scalar', np.float64,
                       (), ScalarCodec(DoubleType()), False)
    ])
    row = {'scalar': 42.0}
    decoded_value = decode_row(row, MatrixSchema)['scalar']
    assert decoded_value == 42
    assert isinstance(decoded_value, np.float64)
Example #26
    def test_get_petastorm_column_ndarray(self):
        expected_type = [np.int8, np.uint8, np.int16, np.int32, np.int64,
                         np.unicode_, np.bool_, np.float32, np.float64,
                         Decimal, np.str_, np.datetime64]
        col_name = 'frame_id'
        for array_type, np_type in zip(NdArrayType, expected_type):
            col = DataFrameColumn(col_name, ColumnType.NDARRAY, True,
                                  array_type, [10, 10])
            petastorm_col = UnischemaField(col_name, np_type, [10, 10],
                                           NdarrayCodec(), True)
            self.assertEqual(SchemaUtils.get_petastorm_column(col),
                             petastorm_col)
Example #27
    @classmethod
    def setUpClass(cls):
        cls._TestField1a = UnischemaField('random', np.string_, (), ScalarCodec(StringType()), False)
        cls._TestField1b = UnischemaField('random', np.string_, (), ScalarCodec(StringType()), False)
        cls._TestField1c = UnischemaField('Random', np.string_, (), ScalarCodec(StringType()), False)
        cls._TestField2a = UnischemaField('id', np.int32, (), ScalarCodec(ShortType()), False)
        cls._TestField2b = UnischemaField('id', np.int32, (), ScalarCodec(ShortType()), False)
        cls._TestField2c = UnischemaField('ID', np.int32, (), ScalarCodec(ShortType()), False)
Example #28
def _test_scalar_type(spark_type, numpy_type, bits):
    codec = ScalarCodec(spark_type())
    field = UnischemaField(name='field_int',
                           numpy_dtype=numpy_type,
                           shape=(),
                           codec=codec,
                           nullable=False)

    min_val, max_val = -2**(bits - 1), 2**(bits - 1) - 1
    assert codec.decode(field, codec.encode(field,
                                            numpy_type(min_val))) == min_val
    assert codec.decode(field, codec.encode(field,
                                            numpy_type(max_val))) == max_val
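
A plausible set of callers for the parametrized helper above (hypothetical test names, not from the original suite): the Spark type, the numpy dtype, and the bit width must agree for the min/max round trip to hold.

import numpy as np
from pyspark.sql.types import ShortType, IntegerType, LongType

def test_short_scalar():
    _test_scalar_type(ShortType, np.int16, bits=16)

def test_integer_scalar():
    _test_scalar_type(IntegerType, np.int32, bits=32)

def test_long_scalar():
    _test_scalar_type(LongType, np.int64, bits=64)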
Example #29
def test_ndarray_codec(codec_factory):
    SHAPE = (10, 20, 3)
    for dtype in NUMERIC_DTYPES:
        expected = np.random.rand(*SHAPE).astype(dtype=dtype)
        codec = codec_factory()
        field = UnischemaField(name='test_name',
                               numpy_dtype=dtype,
                               shape=SHAPE,
                               codec=codec,
                               nullable=False)
        actual = codec.decode(field, codec.encode(field, expected))
        np.testing.assert_equal(actual, expected)
        assert expected.dtype == actual.dtype
Example #30
def test_use_persisted_codec_and_not_provided_by_user(synthetic_dataset,
                                                      reader_factory):
    """In order to start using new codec for some field while maintain the ability to read old datasets that were
    written using an old codec, we need to make sure we are using stored UnischemaField.codec object (that contains
    an old codec/shape)."""
    new_unischema_instance = UnischemaField('matrix_uint16', np.uint16,
                                            (2, 3, 4),
                                            CompressedImageCodec('png'), False)

    with reader_factory(synthetic_dataset.url,
                        schema_fields=[new_unischema_instance]) as reader:
        row = next(reader)
    assert row.matrix_uint16.shape == (32, 16, 3)