def test_normal_schema_read_with_fastparquet():
    """Write a null-free Integer/Boolean schema and read it back with fastparquet.

    The read is performed while the ``PARQUET_ENGINE`` environment variable is
    set to ``"fastparquet"``. The Boolean column must come back with a bool
    dtype and both columns must hold the written values.

    The environment variable is restored in a ``finally`` clause so a failing
    assertion cannot leak the engine override into other tests.
    """
    import os as _os

    with _test_utils.LocalTestFileSystem():
        a = _schema_impl.Schema.create_at_any_location(
            schema_type=_schema_impl.SchemaType([("a", _primitives.Integer), ("b", _primitives.Boolean)])
        )
        with a as writer:
            writer.write(_pd.DataFrame.from_dict({"a": [1, 2, 3, 4], "b": [False, True, True, False]}))

        original_engine = _os.getenv("PARQUET_ENGINE")
        _os.environ["PARQUET_ENGINE"] = "fastparquet"
        try:
            b = _schema_impl.Schema.fetch(a.remote_prefix, schema_type=_schema_impl.SchemaType([]))

            with b as reader:
                df = reader.read()
                assert df["a"].tolist() == [1, 2, 3, 4]
                assert _pd.api.types.is_bool_dtype(df.dtypes["b"])
                assert df["b"].tolist() == [False, True, True, False]
        finally:
            # Always put the process environment back the way we found it.
            if original_engine is None:
                _os.environ.pop("PARQUET_ENGINE", None)
            else:
                _os.environ["PARQUET_ENGINE"] = original_engine
def test_extra_schema_read():
    """Fetch a two-column schema through a one-column schema type.

    With ``truncate_extra_columns=False`` the undeclared column ``b`` is kept;
    with the default read only the declared column ``a`` is returned.
    """
    source_frame = _pd.DataFrame.from_dict({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]})

    with _test_utils.LocalTestFileSystem():
        full_type = _schema_impl.SchemaType([("a", _primitives.Integer), ("b", _primitives.Integer)])
        a = _schema_impl.Schema.create_at_any_location(schema_type=full_type)
        with a as writer:
            writer.write(source_frame)

        narrow_type = _schema_impl.SchemaType([("a", _primitives.Integer)])
        b = _schema_impl.Schema.fetch(a.remote_prefix, schema_type=narrow_type)

        # First pass: keep columns that are not part of the declared type.
        with b as reader:
            everything = reader.read(concat=True, truncate_extra_columns=False)
            assert everything.columns.values.tolist() == ["a", "b"]
            assert everything["a"].tolist() == [1, 2, 3, 4]
            assert everything["b"].tolist() == [5, 6, 7, 8]

        # Second pass: default behaviour drops the undeclared column.
        with b as reader:
            declared_only = reader.read(concat=True)
            assert declared_only.columns.values.tolist() == ["a"]
            assert declared_only["a"].tolist() == [1, 2, 3, 4]
def test_schema_read_consistency_between_two_engines():
    """Check that fastparquet and pyarrow reads of the same schema agree.

    A Boolean column containing ``None`` values is written once, then read
    twice: first with ``PARQUET_ENGINE`` set to ``"fastparquet"`` and then with
    it set to ``"pyarrow"``. The two resulting DataFrames must be equal.

    The environment variable is restored in a ``finally`` clause so a failure
    in either read cannot leak the engine override into other tests.
    """
    import os as _os

    with _test_utils.LocalTestFileSystem():
        a = _schema_impl.Schema.create_at_any_location(
            schema_type=_schema_impl.SchemaType([("a", _primitives.Integer), ("b", _primitives.Boolean)])
        )
        with a as writer:
            writer.write(_pd.DataFrame.from_dict({"a": [1, 2, 3, 4], "b": [None, True, None, False]}))

        original_engine = _os.getenv("PARQUET_ENGINE")
        try:
            _os.environ["PARQUET_ENGINE"] = "fastparquet"
            b = _schema_impl.Schema.fetch(a.remote_prefix, schema_type=_schema_impl.SchemaType([]))
            with b as b_reader:
                b_df = b_reader.read()

            _os.environ["PARQUET_ENGINE"] = "pyarrow"
            c = _schema_impl.Schema.fetch(a.remote_prefix, schema_type=_schema_impl.SchemaType([]))
            with c as c_reader:
                c_df = c_reader.read()

            assert b_df.equals(c_df)
        finally:
            # Always put the process environment back the way we found it.
            if original_engine is None:
                _os.environ.pop("PARQUET_ENGINE", None)
            else:
                _os.environ["PARQUET_ENGINE"] = original_engine
def test_type_promoted_schema_read_with_fastparquet():
    """Read a Boolean column containing nulls with fastparquet.

    The Boolean column is written with ``None`` entries and read back while
    ``PARQUET_ENGINE`` is set to ``"fastparquet"``. The column is expected to
    come back with object dtype and with the ``None`` values preserved.

    The environment variable is restored in a ``finally`` clause so a failing
    assertion cannot leak the engine override into other tests.
    """
    import os as _os

    with _test_utils.LocalTestFileSystem():
        a = _schema_impl.Schema.create_at_any_location(
            schema_type=_schema_impl.SchemaType([("a", _primitives.Integer), ("b", _primitives.Boolean)])
        )
        with a as writer:
            writer.write(_pd.DataFrame.from_dict({"a": [1, 2, 3, 4], "b": [None, True, None, False]}))

        original_engine = _os.getenv("PARQUET_ENGINE")
        _os.environ["PARQUET_ENGINE"] = "fastparquet"
        try:
            b = _schema_impl.Schema.fetch(a.remote_prefix, schema_type=_schema_impl.SchemaType([]))

            with b as reader:
                df = reader.read()
                assert df["a"].tolist() == [1, 2, 3, 4]
                assert _pd.api.types.is_object_dtype(df.dtypes["b"])
                assert df["b"].tolist() == [None, True, None, False]
        finally:
            # Always put the process environment back the way we found it.
            if original_engine is None:
                _os.environ.pop("PARQUET_ENGINE", None)
            else:
                _os.environ["PARQUET_ENGINE"] = original_engine
def test_schema_type():
    """Exercise ``SchemaType`` construction with accepted and rejected inputs."""
    # No-argument construction is allowed.
    _schema_impl.SchemaType()

    # Column specifications that must be accepted.
    accepted = [
        [],
        [
            ("a", _primitives.Integer),
            ("b", _primitives.String),
            ("c", _primitives.Float),
            ("d", _primitives.Boolean),
            ("e", _primitives.Datetime),
        ],
        [("1", _primitives.Integer)],
    ]
    for column_spec in accepted:
        _schema_impl.SchemaType(column_spec)

    # Column specifications that must be rejected, with the expected error.
    rejected = [
        (ValueError, {"a": _primitives.Integer}),
        (TypeError, [("a", _blobs.Blob)]),
        (ValueError, [("a", _primitives.Integer, 1)]),
        (TypeError, [(1, _primitives.Integer)]),
        (TypeError, [("1", [_primitives.Integer])]),
    ]
    for expected_error, column_spec in rejected:
        with _pytest.raises(expected_error):
            _schema_impl.SchemaType(column_spec)
def empty_list():
    """Build a schema from an empty list and check that reading it yields None."""

    def two_integer_columns():
        # Fresh SchemaType per use, mirroring the two separate constructions.
        return _schema_impl.SchemaType([("a", _primitives.Integer), ("b", _primitives.Integer)])

    s = _schema_impl.Schema.from_python_std(t_value=[], schema_type=two_integer_columns())
    assert s is not None

    n = _schema_impl.Schema.fetch(s.uri, schema_type=two_integer_columns())
    with n as reader:
        result = reader.read()
        assert result is None
def test_generic_schema_read():
    """Read a typed two-column schema back through an empty (generic) schema type."""
    payload = {"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]}

    with _test_utils.LocalTestFileSystem():
        typed = _schema_impl.SchemaType([("a", _primitives.Integer), ("b", _primitives.Integer)])
        a = _schema_impl.Schema.create_at_any_location(schema_type=typed)
        with a as writer:
            writer.write(_pd.DataFrame.from_dict(payload))

        generic = _schema_impl.SchemaType([])
        b = _schema_impl.Schema.fetch(a.remote_prefix, schema_type=generic)

        with b as reader:
            frame = reader.read()
            assert frame.columns.values.tolist() == ["a", "b"]
            assert frame["a"].tolist() == [1, 2, 3, 4]
            assert frame["b"].tolist() == [5, 6, 7, 8]
def single_dataframe():
    """Round-trip a single DataFrame through ``Schema.from_python_std``.

    The DataFrame is converted to a schema, fetched back by URI with the same
    schema type, and the result must have the same column names and the same
    values in column ``b`` as the input.
    """
    df1 = _pd.DataFrame.from_dict({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]})
    s = _schema_impl.Schema.from_python_std(
        t_value=df1,
        schema_type=_schema_impl.SchemaType([("a", _primitives.Integer), ("b", _primitives.Integer)]),
    )
    assert s is not None
    n = _schema_impl.Schema.fetch(
        s.uri,
        schema_type=_schema_impl.SchemaType([("a", _primitives.Integer), ("b", _primitives.Integer)]),
    )
    with n as reader:
        df2 = reader.read()
        # Compare the full lists of column names. Calling ``.all()`` on each
        # side would reduce both arrays to a single value and make the
        # assertion pass regardless of the actual names.
        assert df2.columns.values.tolist() == df1.columns.values.tolist()
        assert df2["b"].tolist() == df1["b"].tolist()
def test_fetch(value_type_pair):
    """Fetch three parquet part-files into a local path and verify their contents.

    ``value_type_pair`` is a ``(column_name, flyte_type, values)`` triple. Each
    part-file holds the same single-column records, so chunked reads must match
    ``values`` and a concatenated read must repeat ``values`` cyclically.
    """
    column_name, flyte_type, raw_values = value_type_pair
    records = [(item,) for item in raw_values]
    schema_type = _schema_impl.SchemaType(columns=[(column_name, flyte_type)])

    with _utils.AutoDeletingTempDir("test") as tmpdir:
        # Lay down three identically-shaped part files named 000000..000002.
        for part_index in _six_moves.range(3):
            part_path = tmpdir.get_named_tempfile(str(part_index).zfill(6))
            part_frame = _pd.DataFrame.from_records(records, columns=[column_name])
            part_frame.to_parquet(part_path, coerce_timestamps='us')

        with _utils.AutoDeletingTempDir("test2") as local_dir:
            schema_obj = _schema_impl.Schema.fetch(
                tmpdir.name,
                local_path=local_dir.get_named_tempfile('schema_test'),
                schema_type=schema_type,
            )
            with schema_obj as reader:
                # Chunk-by-chunk: every chunk mirrors the written records.
                for chunk in reader.iter_chunks():
                    observed = chunk[column_name].tolist()
                    for record, actual in _six_moves.zip(records, observed):
                        assert record[0] == actual

                # The reader is exhausted until it is rewound.
                assert reader.read() is None
                reader.seek(0)

                combined = reader.read(concat=True)
                for position, actual in enumerate(combined[column_name].tolist()):
                    assert records[position % len(records)][0] == actual
def test_simple_read_and_write_with_different_types(value_type_pair):
    """Write five chunks to a known location and read them back.

    ``value_type_pair`` is a ``(column_name, flyte_type, values)`` triple. The
    test also checks that ``local_path`` is ``None`` outside the schema's
    context and points inside the sandbox while the context is active.
    """
    column_name, flyte_type, raw_values = value_type_pair
    records = [(item,) for item in raw_values]
    schema_type = _schema_impl.SchemaType(columns=[(column_name, flyte_type)])

    with _test_utils.LocalTestFileSystem() as sandbox:
        with _utils.AutoDeletingTempDir("test") as t:
            # ---- write side ----
            a = _schema_impl.Schema.create_at_known_location(t.name, mode='wb', schema_type=schema_type)
            assert a.local_path is None
            with a as writer:
                for _ in _six_moves.range(5):
                    chunk_frame = _pd.DataFrame.from_records(records, columns=[column_name])
                    writer.write(chunk_frame)
                assert a.local_path.startswith(sandbox.name)
            assert a.local_path is None

            # ---- read side ----
            b = _schema_impl.Schema.create_at_known_location(t.name, mode='rb', schema_type=schema_type)
            assert b.local_path is None
            with b as reader:
                for chunk in reader.iter_chunks():
                    observed = chunk[column_name].tolist()
                    for record, actual in _six_moves.zip(records, observed):
                        assert record[0] == actual

                # The reader is exhausted until it is rewound.
                assert reader.read() is None
                reader.seek(0)

                combined = reader.read(concat=True)
                for position, actual in enumerate(combined[column_name].tolist()):
                    assert records[position % len(records)][0] == actual
                assert b.local_path.startswith(sandbox.name)
            assert b.local_path is None
def test_datetime_coercion():
    """Write microsecond-precision datetimes with and without ms coercion."""
    newest = _datetime.datetime(year=2017, month=1, day=1, hour=1, minute=1, second=1, microsecond=1)
    values = [(newest - _datetime.timedelta(days=offset),) for offset in _six_moves.range(5)]
    schema_type = _schema_impl.SchemaType(columns=[("testname", _primitives.Datetime)])

    with _test_utils.LocalTestFileSystem():
        with _utils.AutoDeletingTempDir("test") as t:
            a = _schema_impl.Schema.create_at_known_location(t.name, mode="wb", schema_type=schema_type)
            with a as writer:
                for _ in _six_moves.range(5):
                    # Coercing us -> ms segfaults unless truncation is explicitly allowed.
                    frame = _pd.DataFrame.from_records(values, columns=["testname"])
                    writer.write(frame, coerce_timestamps="ms", allow_truncated_timestamps=True)

                    # TODO: once the segfault bug is resolved, also assert that a
                    # write with coerce_timestamps='ms' and no truncation
                    # allowance raises an exception.

            # Writing the same data with default timestamp handling must work.
            b = _schema_impl.Schema.create_at_known_location(t.name, mode="wb", schema_type=schema_type)
            with b as writer:
                for _ in _six_moves.range(5):
                    frame = _pd.DataFrame.from_records(values, columns=["testname"])
                    writer.write(frame)
def mixed_list():
    """A list mixing a DataFrame with a plain list must raise FlyteTypeException."""
    frame = _pd.DataFrame.from_dict({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]})
    not_a_frame = [1, 2, 3]
    integer_columns = _schema_impl.SchemaType([("a", _primitives.Integer), ("b", _primitives.Integer)])

    with _pytest.raises(_user_exceptions.FlyteTypeException):
        _schema_impl.Schema.from_python_std(t_value=[frame, not_a_frame], schema_type=integer_columns)
def list_of_dataframes():
    """Round-trip a list of two DataFrames through ``Schema.from_python_std``.

    The list is converted to a schema, fetched back by URI with the same
    schema type and read chunk by chunk. Every chunk must have the input's
    column names, and the concatenated ``b`` values must equal the ``b``
    values of the first DataFrame followed by those of the second.
    """
    df1 = _pd.DataFrame.from_dict({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]})
    df2 = _pd.DataFrame.from_dict({"a": [9, 10, 11, 12], "b": [13, 14, 15, 16]})
    s = _schema_impl.Schema.from_python_std(
        t_value=[df1, df2],
        schema_type=_schema_impl.SchemaType([("a", _primitives.Integer), ("b", _primitives.Integer)]),
    )
    assert s is not None
    n = _schema_impl.Schema.fetch(
        s.uri,
        schema_type=_schema_impl.SchemaType([("a", _primitives.Integer), ("b", _primitives.Integer)]),
    )
    with n as reader:
        actual = []
        for df in reader.iter_chunks():
            # Compare the full lists of column names. Calling ``.all()`` on
            # each side would reduce both arrays to a single value and make
            # the assertion pass regardless of the actual names.
            assert df.columns.values.tolist() == df1.columns.values.tolist()
            actual.extend(df["b"].tolist())
        b_val = df1["b"].tolist()
        b_val.extend(df2["b"].tolist())
        assert actual == b_val
def test_partial_column_read():
    """Read only column ``b`` from a two-column schema.

    The schema is written with Integer columns ``a`` and ``b``, fetched by URI
    with the same schema type, and read with ``columns=['b']``. Only ``b`` may
    be present in the result.
    """
    with _test_utils.LocalTestFileSystem():
        a = _schema_impl.Schema.create_at_any_location(
            schema_type=_schema_impl.SchemaType([("a", _primitives.Integer), ("b", _primitives.Integer)])
        )
        with a as writer:
            writer.write(_pd.DataFrame.from_dict({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]}))

        b = _schema_impl.Schema.fetch(
            a.uri,
            schema_type=_schema_impl.SchemaType([("a", _primitives.Integer), ("b", _primitives.Integer)]),
        )

        with b as reader:
            df = reader.read(columns=['b'])
            # Compare as a list: asserting on ``ndarray == list`` yields an
            # array whose truth value is ambiguous (ValueError) as soon as more
            # than one column is returned, hiding the real failure.
            assert df.columns.values.tolist() == ['b']
            assert df['b'].tolist() == [5, 6, 7, 8]
def test_create_at_known_location():
    """Create a schema at a known directory and check what lands on disk."""
    with _test_utils.LocalTestFileSystem():
        with _utils.AutoDeletingTempDir("test") as wd:
            untyped = _schema_impl.SchemaType()
            b = _schema_impl.Schema.create_at_known_location(wd.name, schema_type=untyped)

            # Before entering the context: no local copy, directory-style
            # remote location, write mode.
            assert b.local_path is None
            assert b.remote_location == wd.name + "/"
            assert b.mode == "wb"

            frame = _pd.DataFrame.from_dict({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]})
            with b as w:
                w.write(frame)

            # The first chunk is expected under the name "000000".
            first_part = _os.path.join(wd.name, "000000")
            on_disk = _pd.read_parquet(first_part)
            assert list(on_disk["a"]) == [1, 2, 3, 4]
            assert list(on_disk["b"]) == [5, 6, 7, 8]
def test_schema_type():
    """Exercise ``SchemaType`` construction with accepted and rejected inputs."""

    def assert_rejected(expected_error, column_spec):
        # Construction with this spec must fail with the given exception type.
        with _pytest.raises(expected_error):
            _schema_impl.SchemaType(column_spec)

    # Accepted: no argument, an empty list, and one column of every primitive.
    _schema_impl.SchemaType()
    _schema_impl.SchemaType([])
    every_primitive = [
        ('a', _primitives.Integer),
        ('b', _primitives.String),
        ('c', _primitives.Float),
        ('d', _primitives.Boolean),
        ('e', _primitives.Datetime),
    ]
    _schema_impl.SchemaType(every_primitive)

    assert_rejected(ValueError, {'a': _primitives.Integer})
    assert_rejected(TypeError, [('a', _blobs.Blob)])
    assert_rejected(ValueError, [('a', _primitives.Integer, 1)])

    # A numeric-looking string is a valid column name; an actual int is not.
    _schema_impl.SchemaType([('1', _primitives.Integer)])
    assert_rejected(TypeError, [(1, _primitives.Integer)])
    assert_rejected(TypeError, [('1', [_primitives.Integer])])
def test_download(value_type_pair):
    """Download a three-part schema explicitly, then via the sandbox default.

    ``value_type_pair`` is a ``(column_name, flyte_type, values)`` triple.
    Three cases are covered: download to an explicit local path, download with
    no path and no sandbox (must raise), and download with no path inside a
    ``LocalTestFileSystem``.
    """
    column_name, flyte_type, raw_values = value_type_pair
    records = [(item,) for item in raw_values]
    schema_type = _schema_impl.SchemaType(columns=[(column_name, flyte_type)])

    def verify_contents(schema_obj):
        # Chunked reads mirror the records; a concatenated read repeats them.
        with schema_obj as reader:
            for chunk in reader.iter_chunks():
                observed = chunk[column_name].tolist()
                for record, actual in _six_moves.zip(records, observed):
                    assert record[0] == actual
            assert reader.read() is None
            reader.seek(0)
            combined = reader.read(concat=True)
            for position, actual in enumerate(combined[column_name].tolist()):
                assert records[position % len(records)][0] == actual

    with _utils.AutoDeletingTempDir("test") as tmpdir:
        # Lay down three identically-shaped part files named 000000..000002.
        for part_index in _six_moves.range(3):
            part_path = tmpdir.get_named_tempfile(str(part_index).zfill(6))
            part_frame = _pd.DataFrame.from_records(records, columns=[column_name])
            part_frame.to_parquet(part_path, coerce_timestamps="us")

        # Case 1: explicit download destination.
        with _utils.AutoDeletingTempDir("test2") as local_dir:
            schema_obj = _schema_impl.Schema(tmpdir.name, schema_type=schema_type)
            schema_obj.download(local_dir.get_named_tempfile(_uuid.uuid4().hex))
            verify_contents(schema_obj)

        # Case 2: no destination and no sandbox available.
        with _pytest.raises(Exception):
            schema_obj = _schema_impl.Schema(tmpdir.name, schema_type=schema_type)
            schema_obj.download()

        # Case 3: no destination, but running inside the test file system.
        with _test_utils.LocalTestFileSystem():
            schema_obj = _schema_impl.Schema(tmpdir.name, schema_type=schema_type)
            schema_obj.download()
            verify_contents(schema_obj)
# Schema subclass created with SchemaInstantiator as its metaclass (Python 3
# keyword syntax). Its `_schema_type` class attribute is a SchemaType built from
# `columns`, which is not defined in this block — presumably supplied by an
# enclosing scope; confirm against the surrounding function.
class _Schema(Schema, metaclass=SchemaInstantiator): _schema_type = _schema_impl.SchemaType(columns=columns)
# Schema subclass whose metaclass is applied through `_six.with_metaclass`
# (SchemaInstantiator as metaclass, Schema as base). Its `_schema_type` class
# attribute is a SchemaType built from `columns`, which is not defined in this
# block — presumably supplied by an enclosing scope; confirm against the
# surrounding function.
class _Schema(_six.with_metaclass(SchemaInstantiator, Schema)): _schema_type = _schema_impl.SchemaType(columns=columns)
def test_hive_queries(monkeypatch):
    """Check the Hive SQL generated for query-backed schemas and partitions.

    ``_schema_impl._uuid`` is replaced with a stub whose ``uuid4().hex`` is
    always ``'test_uuid'`` so the generated table names are predictable.
    Queries are compared after collapsing all whitespace runs to one space.
    """

    class _FixedUuid4(object):
        def __init__(self):
            self.hex = 'test_uuid'

    class _FixedUuidModule(object):
        def uuid4(self):
            return _FixedUuid4()

    def collapse_whitespace(sql):
        return " ".join(sql.split())

    monkeypatch.setattr(_schema_impl, '_uuid', _FixedUuidModule())

    all_types = _schema_impl.SchemaType(
        [
            ('a', _primitives.Integer),
            ('b', _primitives.String),
            ('c', _primitives.Float),
            ('d', _primitives.Boolean),
            ('e', _primitives.Datetime),
        ]
    )

    with _test_utils.LocalTestFileSystem():
        df, generated = _schema_impl.Schema.create_from_hive_query(
            "SELECT a, b, c, d, e FROM some_place WHERE i = 0",
            stage_query="CREATE TEMPORARY TABLE some_place AS SELECT * FROM some_place_original",
            known_location="s3://my_fixed_path/",
            schema_type=all_types,
        )

        expected = """ CREATE TEMPORARY TABLE some_place AS SELECT * FROM some_place_original; CREATE TEMPORARY TABLE test_uuid_tmp AS SELECT a, b, c, d, e FROM some_place WHERE i = 0; CREATE EXTERNAL TABLE test_uuid LIKE test_uuid_tmp STORED AS PARQUET; ALTER TABLE test_uuid SET LOCATION 's3://my_fixed_path/'; INSERT OVERWRITE TABLE test_uuid SELECT a as a, b as b, CAST(c as double) c, d as d, e as e FROM test_uuid_tmp; DROP TABLE test_uuid; """
        assert collapse_whitespace(generated) == collapse_whitespace(expected)

        # Adding a partition to an existing table.
        expected = """ ALTER TABLE some_table ADD IF NOT EXISTS PARTITION ( region = 'SEA', ds = '2017-01-01' ) LOCATION 's3://my_fixed_path/'; ALTER TABLE some_table PARTITION ( region = 'SEA', ds = '2017-01-01' ) SET LOCATION 's3://my_fixed_path/'; """
        partition_spec = _collections.OrderedDict([('region', 'SEA'), ('ds', '2017-01-01')])
        generated = df.get_write_partition_to_hive_table_query('some_table', partitions=partition_spec)
        assert collapse_whitespace(generated) == collapse_whitespace(expected)