def test_table():
    """Table.from_json parses name/share/schema from a JSON document."""
    # Renamed from ``json`` to avoid shadowing the stdlib ``json`` module,
    # which this file uses elsewhere.
    json_str = """ { "name" : "table_name", "share" : "share_name", "schema" : "schema_name" } """
    table = Table.from_json(json_str)
    assert table == Table("table_name", "share_name", "schema_name")
def test_list_tables(sharing_client: SharingClient):
    """Listing tables of a schema returns exactly the fixture tables."""
    share1_tables = sharing_client.list_tables(Schema(name="default", share="share1"))
    assert share1_tables == [
        Table(name=name, share="share1", schema="default")
        for name in ("table1", "table3", "table7")
    ]

    share2_tables = sharing_client.list_tables(Schema(name="default", share="share2"))
    assert share2_tables == [Table(name="table2", share="share2", schema="default")]
def test_list_tables_with_pagination(rest_client: DataSharingRestClient):
    """With max_results=1 the first page has one table; the next_page_token
    fetches the remaining tables of the schema."""
    schema = Schema(name="default", share="share1")

    first_page = rest_client.list_tables(schema, max_results=1)
    assert first_page.tables == [
        Table(name="table1", share="share1", schema="default"),
    ]

    second_page = rest_client.list_tables(schema, page_token=first_page.next_page_token)
    assert second_page.tables == [
        Table(name="table3", share="share1", schema="default"),
        Table(name="table7", share="share1", schema="default"),
    ]
def test_list_tables(rest_client: DataSharingRestClient):
    """The REST client lists all tables of each schema without pagination."""
    resp = rest_client.list_tables(Schema(name="default", share="share1"))
    assert resp.tables == [
        Table(name=name, share="share1", schema="default")
        for name in ("table1", "table3", "table7")
    ]

    resp = rest_client.list_tables(Schema(name="default", share="share2"))
    assert resp.tables == [Table(name="table2", share="share2", schema="default")]
def test_to_pandas_empty(rest_client: DataSharingRestClient):
    """to_pandas() on a table with no data files returns an empty DataFrame
    whose columns and dtypes are derived from the Delta schema string alone.

    A mock REST client serves a metadata-only response (no AddFiles) covering
    every supported primitive type plus array/struct/map nested types.
    """
    class RestClientMock:
        # Minimal stand-in for DataSharingRestClient: only the method
        # DeltaSharingReader actually calls is implemented.
        def list_files_in_table(
            self,
            table: Table,
            *,
            predicateHints: Optional[Sequence[str]] = None,
            limitHint: Optional[int] = None,
        ) -> ListFilesInTableResponse:
            # The reader must pass through the table it was constructed with.
            assert table == Table("table_name", "share_name", "schema_name")
            metadata = Metadata(schema_string=(
                '{"fields":['
                '{"metadata":{},"name":"a","nullable":true,"type":"boolean"},'
                '{"metadata":{},"name":"b","nullable":true,"type":"byte"},'
                '{"metadata":{},"name":"c","nullable":true,"type":"short"},'
                '{"metadata":{},"name":"d","nullable":true,"type":"integer"},'
                '{"metadata":{},"name":"e","nullable":true,"type":"long"},'
                '{"metadata":{},"name":"f","nullable":true,"type":"float"},'
                '{"metadata":{},"name":"g","nullable":true,"type":"double"},'
                '{"metadata":{},"name":"h","nullable":true,"type":"decimal(5,2)"},'
                '{"metadata":{},"name":"i","nullable":true,"type":"string"},'
                '{"metadata":{},"name":"j","nullable":true,"type":"binary"},'
                '{"metadata":{},"name":"k","nullable":true,"type":"timestamp"},'
                '{"metadata":{},"name":"l","nullable":true,"type":"date"},'
                '{"metadata":{},"name":"m","nullable":true,"type":{"type":"array",'
                '"elementType":"string","containsNull":true}},'
                '{"metadata":{},"name":"n","nullable":true,"type":{"type":"struct","fields":'
                '[{"name":"foo","type":"string","nullable":true,"metadata":{}},'
                '{"name":"bar","type":"integer","nullable":true,"metadata":{}}]}},'
                '{"metadata":{},"name":"o","nullable":true,"type":{"type":"map",'
                '"keyType":"string","valueType":"integer","valueContainsNull":true}}'
                '],"type":"struct"}'))
            # No files: forces the reader down its "empty table" code path.
            add_files: Sequence[AddFile] = []
            return ListFilesInTableResponse(protocol=None, metadata=metadata,
                                            add_files=add_files)

    reader = DeltaSharingReader(
        Table("table_name", "share_name", "schema_name"), RestClientMock()  # type: ignore
    )
    pdf = reader.to_pandas()

    # table7 on the real test server has the same schema; slicing off all of
    # its rows yields an empty frame with the correct columns and dtypes to
    # compare against.
    reader = DeltaSharingReader(
        Table(name="table7", share="share1", schema="default"), rest_client)
    expected = reader.to_pandas().iloc[0:0]

    pd.testing.assert_frame_equal(pdf, expected)
def list_files_in_table(
    self,
    table: Table,
    *,
    predicateHints: Optional[Sequence[str]] = None,
    limitHint: Optional[int] = None,
) -> ListFilesInTableResponse:
    """Mock: serve two AddFiles, one per partition value of column "b",
    backed by the parquet files written into ``tmp_path`` by the test."""
    # The reader must forward the table it was constructed with.
    assert table == Table("table_name", "share_name", "schema_name")

    schema_json = (
        '{"fields":['
        '{"metadata":{},"name":"a","nullable":true,"type":"long"},'
        '{"metadata":{},"name":"b","nullable":true,"type":"string"}'
        '],"type":"struct"}')

    first_file = AddFile(
        url=str(tmp_path / "pdf1.parquet"),
        id="pdf1",
        partition_values={"b": "x"},
        size=0,
        stats="",
    )
    second_file = AddFile(
        url=str(tmp_path / "pdf2.parquet"),
        id="pdf2",
        partition_values={"b": "y"},
        size=0,
        stats="",
    )

    return ListFilesInTableResponse(
        protocol=None,
        metadata=Metadata(schema_string=schema_json),
        add_files=[first_file, second_file],
    )
def list_files_in_table(
    self,
    table: Table,
    *,
    predicateHints: Optional[Sequence[str]] = None,
    limitHint: Optional[int] = None,
) -> ListFilesInTableResponse:
    """Mock: return a metadata-only response (no data files) whose schema
    covers every primitive type plus array/struct/map nested types."""
    # The reader must forward the table it was constructed with.
    assert table == Table("table_name", "share_name", "schema_name")
    metadata = Metadata(schema_string=(
        '{"fields":['
        '{"metadata":{},"name":"a","nullable":true,"type":"boolean"},'
        '{"metadata":{},"name":"b","nullable":true,"type":"byte"},'
        '{"metadata":{},"name":"c","nullable":true,"type":"short"},'
        '{"metadata":{},"name":"d","nullable":true,"type":"integer"},'
        '{"metadata":{},"name":"e","nullable":true,"type":"long"},'
        '{"metadata":{},"name":"f","nullable":true,"type":"float"},'
        '{"metadata":{},"name":"g","nullable":true,"type":"double"},'
        '{"metadata":{},"name":"h","nullable":true,"type":"decimal(5,2)"},'
        '{"metadata":{},"name":"i","nullable":true,"type":"string"},'
        '{"metadata":{},"name":"j","nullable":true,"type":"binary"},'
        '{"metadata":{},"name":"k","nullable":true,"type":"timestamp"},'
        '{"metadata":{},"name":"l","nullable":true,"type":"date"},'
        '{"metadata":{},"name":"m","nullable":true,"type":{"type":"array",'
        '"elementType":"string","containsNull":true}},'
        '{"metadata":{},"name":"n","nullable":true,"type":{"type":"struct","fields":'
        '[{"name":"foo","type":"string","nullable":true,"metadata":{}},'
        '{"name":"bar","type":"integer","nullable":true,"metadata":{}}]}},'
        '{"metadata":{},"name":"o","nullable":true,"type":{"type":"map",'
        '"keyType":"string","valueType":"integer","valueContainsNull":true}}'
        '],"type":"struct"}'))
    # Empty file list: exercises the caller's empty-table handling.
    add_files: Sequence[AddFile] = []
    return ListFilesInTableResponse(protocol=None, metadata=metadata,
                                    add_files=add_files)
def test_to_pandas_partitioned_different_schemas(tmp_path):
    """Files whose physical schemas differ (pdf1 lacks column "b") are still
    merged correctly, with the partition column "c" filled from the
    partition values of each AddFile."""
    frame_one = pd.DataFrame({"a": [1, 2, 3]})
    frame_two = pd.DataFrame({"a": [4.0, 5.0, 6.0], "b": ["d", "e", "f"]})
    frame_one.to_parquet(tmp_path / "pdf1.parquet")
    frame_two.to_parquet(tmp_path / "pdf2.parquet")

    class RestClientMock:
        # Stand-in for DataSharingRestClient serving the two local files.
        def list_files_in_table(
            self,
            table: Table,
            *,
            predicateHints: Optional[Sequence[str]] = None,
            limitHint: Optional[int] = None,
        ) -> ListFilesInTableResponse:
            assert table == Table("table_name", "share_name", "schema_name")
            schema_json = (
                '{"fields":['
                '{"metadata":{},"name":"a","nullable":true,"type":"long"},'
                '{"metadata":{},"name":"b","nullable":true,"type":"string"},'
                '{"metadata":{},"name":"c","nullable":true,"type":"date"}'
                '],"type":"struct"}')
            file_one = AddFile(
                url=str(tmp_path / "pdf1.parquet"),
                id="pdf1",
                partition_values={"c": "2021-01-01"},
                size=0,
                stats="",
            )
            file_two = AddFile(
                url=str(tmp_path / "pdf2.parquet"),
                id="pdf2",
                partition_values={"c": "2021-01-02"},
                size=0,
                stats="",
            )
            return ListFilesInTableResponse(
                protocol=None,
                metadata=Metadata(schema_string=schema_json),
                add_files=[file_one, file_two],
            )

    reader = DeltaSharingReader(
        Table("table_name", "share_name", "schema_name"), RestClientMock())
    pdf = reader.to_pandas()

    # Each expected half carries its file's partition value as column "c".
    expected_one = frame_one.copy()
    expected_one["c"] = date(2021, 1, 1)
    expected_two = frame_two.copy()
    expected_two["c"] = date(2021, 1, 2)
    expected = pd.concat([expected_one, expected_two])[["a", "b", "c"]].reset_index(drop=True)

    pd.testing.assert_frame_equal(pdf, expected)
def test_query_table_metadata_partitioned(rest_client: DataSharingRestClient):
    """Metadata of a partitioned table reports its partition column."""
    resp = rest_client.query_table_metadata(
        Table(name="table2", share="share2", schema="default"))

    assert resp.protocol == Protocol(min_reader_version=1)

    expected_schema = (
        '{"type":"struct","fields":['
        '{"name":"eventTime","type":"timestamp","nullable":true,"metadata":{}},'
        '{"name":"date","type":"date","nullable":true,"metadata":{}}'
        "]}")
    assert resp.metadata == Metadata(
        id="f8d5c169-3d01-4ca3-ad9e-7dc3355aedb2",
        format=Format(provider="parquet", options={}),
        schema_string=expected_schema,
        partition_columns=["date"],
    )
def test_list_files_in_table_partitioned_different_schemas(
        rest_client: DataSharingRestClient):
    """table3's files were written with evolving schemas: the first two files
    carry a "type" column in their stats, the third predates it, while the
    table-level metadata shows the merged (latest) schema."""
    response = rest_client.list_files_in_table(
        Table(name="table3", share="share1", schema="default"))
    assert response.protocol == Protocol(min_reader_version=1)
    assert response.metadata == Metadata(
        id="7ba6d727-a578-4234-a138-953f790b427c",
        format=Format(provider="parquet", options={}),
        schema_string=
        ('{"type":"struct","fields":['
         '{"name":"eventTime","type":"timestamp","nullable":true,"metadata":{}},'
         '{"name":"date","type":"date","nullable":true,"metadata":{}},'
         '{"name":"type","type":"string","nullable":true,"metadata":{}}'
         "]}"),
        partition_columns=["date"],
    )
    # URLs are pre-signed and change per request, so each expected AddFile
    # reuses the URL from the actual response.
    assert response.add_files == [
        AddFile(
            url=response.add_files[0].url,
            id="db213271abffec6fd6c7fc2aad9d4b3f",
            partition_values={"date": "2021-04-28"},
            size=778,
            stats=
            (r'{"numRecords":1,'
             r'"minValues":{"eventTime":"2021-04-28T23:36:51.945Z","type":"bar"},'
             r'"maxValues":{"eventTime":"2021-04-28T23:36:51.945Z","type":"bar"},'
             r'"nullCount":{"eventTime":0,"type":0}}'),
        ),
        AddFile(
            url=response.add_files[1].url,
            id="f1f8be229d8b18eb6d6a34255f2d7089",
            partition_values={"date": "2021-04-28"},
            size=778,
            stats=
            (r'{"numRecords":1,'
             r'"minValues":{"eventTime":"2021-04-28T23:36:47.599Z","type":"foo"},'
             r'"maxValues":{"eventTime":"2021-04-28T23:36:47.599Z","type":"foo"},'
             r'"nullCount":{"eventTime":0,"type":0}}'),
        ),
        # Written before the "type" column existed: no "type" in its stats.
        AddFile(
            url=response.add_files[2].url,
            id="a892a55d770ee70b34ffb2ebf7dc2fd0",
            partition_values={"date": "2021-04-28"},
            size=573,
            stats=(r'{"numRecords":1,'
                   r'"minValues":{"eventTime":"2021-04-28T23:35:53.156Z"},'
                   r'"maxValues":{"eventTime":"2021-04-28T23:35:53.156Z"},'
                   r'"nullCount":{"eventTime":0}}'),
        ),
    ]
def test_query_table_metadata_non_partitioned(
        rest_client: DataSharingRestClient):
    """Metadata of a non-partitioned table has an empty partition list."""
    resp = rest_client.query_table_metadata(
        Table(name="table1", share="share1", schema="default"))

    assert resp.protocol == Protocol(min_reader_version=1)

    expected_schema = (
        '{"type":"struct","fields":['
        '{"name":"eventTime","type":"timestamp","nullable":true,"metadata":{}},'
        '{"name":"date","type":"date","nullable":true,"metadata":{}}'
        "]}")
    assert resp.metadata == Metadata(
        id="ed96aa41-1d81-4b7f-8fb5-846878b4b0cf",
        format=Format(provider="parquet", options={}),
        schema_string=expected_schema,
        partition_columns=[],
    )
def test_query_table_metadata_partitioned_different_schemas(
        rest_client: DataSharingRestClient):
    """table3 evolved its schema; metadata reports the merged (latest) schema
    including the later-added "type" column."""
    resp = rest_client.query_table_metadata(
        Table(name="table3", share="share1", schema="default"))

    assert resp.protocol == Protocol(min_reader_version=1)

    expected_schema = (
        '{"type":"struct","fields":['
        '{"name":"eventTime","type":"timestamp","nullable":true,"metadata":{}},'
        '{"name":"date","type":"date","nullable":true,"metadata":{}},'
        '{"name":"type","type":"string","nullable":true,"metadata":{}}'
        "]}")
    assert resp.metadata == Metadata(
        id="7ba6d727-a578-4234-a138-953f790b427c",
        format=Format(provider="parquet", options={}),
        schema_string=expected_schema,
        partition_columns=["date"],
    )
def load_as_pandas(url: str, limit: Optional[int] = None) -> pd.DataFrame:
    """
    Load the shared table using the given url as a pandas DataFrame.

    :param url: a url under the format "<profile>#<share>.<schema>.<table>"
    :param limit: a non-negative int. Load only the ``limit`` rows if the
      parameter is specified. Use this optional parameter to explore the
      shared table without loading the entire table to the memory.
    :return: A pandas DataFrame representing the shared table.
    :raises ValueError: if ``limit`` is negative.
    """
    # Enforce the documented "non-negative" contract up front instead of
    # failing with an obscure error deep inside the reader.
    if limit is not None and limit < 0:
        raise ValueError(f"limit must be non-negative, got {limit}")
    profile_json, share, schema, table = _parse_url(url)
    profile = DeltaSharingProfile.read_from_file(profile_json)
    return DeltaSharingReader(
        table=Table(name=table, share=share, schema=schema),
        rest_client=DataSharingRestClient(profile),
        limit=limit,
    ).to_pandas()
def test_list_files_in_table_non_partitioned(
        rest_client: DataSharingRestClient):
    """Listing files of a non-partitioned table returns every file with empty
    partition_values; predicateHints are forwarded but (being hints) do not
    change the result set."""
    response = rest_client.list_files_in_table(
        Table(name="table1", share="share1", schema="default"),
        predicateHints=["date = '2021-01-31'"],
    )
    assert response.protocol == Protocol(min_reader_version=1)
    assert response.metadata == Metadata(
        id="ed96aa41-1d81-4b7f-8fb5-846878b4b0cf",
        format=Format(provider="parquet", options={}),
        schema_string=
        ('{"type":"struct","fields":['
         '{"name":"eventTime","type":"timestamp","nullable":true,"metadata":{}},'
         '{"name":"date","type":"date","nullable":true,"metadata":{}}'
         "]}"),
        partition_columns=[],
    )
    # URLs are pre-signed and change per request, so each expected AddFile
    # reuses the URL from the actual response.
    assert response.add_files == [
        AddFile(
            url=response.add_files[0].url,
            id="061cb3683a467066995f8cdaabd8667d",
            partition_values={},
            size=781,
            stats=
            (r'{"numRecords":1,'
             r'"minValues":{"eventTime":"2021-04-28T06:32:22.421Z","date":"2021-04-28"},'
             r'"maxValues":{"eventTime":"2021-04-28T06:32:22.421Z","date":"2021-04-28"},'
             r'"nullCount":{"eventTime":0,"date":0}}'),
        ),
        AddFile(
            url=response.add_files[1].url,
            id="e268cbf70dbaa6143e7e9fa3e2d3b00e",
            partition_values={},
            size=781,
            stats=
            (r'{"numRecords":1,'
             r'"minValues":{"eventTime":"2021-04-28T06:32:02.070Z","date":"2021-04-28"},'
             r'"maxValues":{"eventTime":"2021-04-28T06:32:02.070Z","date":"2021-04-28"},'
             r'"nullCount":{"eventTime":0,"date":0}}'),
        ),
    ]
def list_all_tables(
        self,
        share: Share,
        *,
        max_results: Optional[int] = None,
        page_token: Optional[str] = None) -> ListAllTablesResponse:
    """List one page of all tables in *share* across every schema.

    :param share: the share whose tables to list.
    :param max_results: optional server-side page size ("maxResults").
    :param page_token: optional continuation token from a previous page.
    :return: a ListAllTablesResponse with the tables and the next page token.
    """
    request_data: Dict = {}
    # Only include pagination parameters the caller actually supplied.
    if max_results is not None:
        request_data["maxResults"] = max_results
    if page_token is not None:
        request_data["pageToken"] = page_token

    with self._get_internal(f"/shares/{share.name}/all-tables", request_data) as lines:
        # The first line of the response body is the JSON payload.
        payload = json.loads(next(lines))

    parsed_tables = [
        Table.from_json(item) for item in payload.get("items", [])
    ]
    return ListAllTablesResponse(
        tables=parsed_tables,
        next_page_token=payload.get("nextPageToken", None),
    )
def test_list_files_in_table_partitioned(rest_client: DataSharingRestClient):
    """Listing files of a partitioned table returns each file with its
    partition_values populated; predicateHints and limitHint are forwarded
    but (being hints) do not change the result set."""
    response = rest_client.list_files_in_table(
        Table(name="table2", share="share2", schema="default"),
        predicateHints=["date = '2021-01-31'"],
        limitHint=123,
    )
    assert response.protocol == Protocol(min_reader_version=1)
    assert response.metadata == Metadata(
        id="f8d5c169-3d01-4ca3-ad9e-7dc3355aedb2",
        format=Format(provider="parquet", options={}),
        schema_string=
        ('{"type":"struct","fields":['
         '{"name":"eventTime","type":"timestamp","nullable":true,"metadata":{}},'
         '{"name":"date","type":"date","nullable":true,"metadata":{}}'
         "]}"),
        partition_columns=["date"],
    )
    # URLs are pre-signed and change per request, so each expected AddFile
    # reuses the URL from the actual response.
    assert response.add_files == [
        AddFile(
            url=response.add_files[0].url,
            id="9f1a49539c5cffe1ea7f9e055d5c003c",
            partition_values={"date": "2021-04-28"},
            size=573,
            stats=(r'{"numRecords":1,'
                   r'"minValues":{"eventTime":"2021-04-28T23:33:57.955Z"},'
                   r'"maxValues":{"eventTime":"2021-04-28T23:33:57.955Z"},'
                   r'"nullCount":{"eventTime":0}}'),
        ),
        AddFile(
            url=response.add_files[1].url,
            id="cd2209b32f5ed5305922dd50f5908a75",
            partition_values={"date": "2021-04-28"},
            size=573,
            stats=(r'{"numRecords":1,'
                   r'"minValues":{"eventTime":"2021-04-28T23:33:48.719Z"},'
                   r'"maxValues":{"eventTime":"2021-04-28T23:33:48.719Z"},'
                   r'"nullCount":{"eventTime":0}}'),
        ),
    ]
def _verify_all_tables_result(tables: Sequence[Table]):
    """Assert *tables* matches the full fixture listing, in server order."""
    expected_specs = [
        ("table1", "share1", "default"),
        ("table3", "share1", "default"),
        ("table7", "share1", "default"),
        ("table2", "share2", "default"),
        ("table4", "share3", "default"),
        ("table5", "share3", "default"),
        ("test_gzip", "share4", "default"),
        ("table8", "share7", "schema1"),
        ("table9", "share7", "schema2"),
        ("table_wasb", "share_azure", "default"),
        ("table_abfs", "share_azure", "default"),
        ("table_gcs", "share_gcp", "default"),
    ]
    assert tables == [
        Table(name=name, share=share, schema=schema)
        for name, share, schema in expected_specs
    ]