import boto3
import pandas as pd
import pyarrow.parquet as pq
import pytest
import s3fs
from unittest.mock import patch

import s3parq.fetch_parq as fetch_parq
import s3parq.publish_parq as parq

# NOTE: the shared helpers used throughout (get_s3_client, setup_grouped_dataframe,
# setup_partitioned_parquet, sorted_dfs_equal_by_pandas_testing) live in the suite's
# test utilities; hedged reference sketches appear at the end of this file. Several
# tests below are methods of test classes (hence `self`, `setup_s3`, and
# `setup_redshift_params`) whose class definitions and @patch decorators fall
# outside this excerpt.


def test_s3_parquet_to_dataframe():
    with get_s3_client() as s3_client:
        columns = {
            "string_col": "string",
            "int_col": "integer",
            "float_col": "float",
            "bool_col": "boolean",
            "datetime_col": "datetime",
        }
        bucket = "foobucket"
        key = "fookey"
        df = setup_grouped_dataframe(count=10, columns=columns)
        bucket, parquet_paths = setup_partitioned_parquet(
            dataframe=df,
            bucket=bucket,
            key=key,
            partition_data_types={},
            s3_client=s3_client)

        first_published_file = parquet_paths[0]
        response = fetch_parq._s3_parquet_to_dataframe(
            bucket=bucket, key=first_published_file, partition_metadata={})

        assert isinstance(response, pd.DataFrame)
        for col in columns:
            assert col in response.columns

        assert response.shape == df.shape
        sorted_dfs_equal_by_pandas_testing(response, df)

def test_table_publish_mixed_type_column(self, mock_session_helper, mock_create_table):
    dataframe = setup_grouped_dataframe()
    bucket, key = self.setup_s3()
    partitions = []
    redshift_params = self.setup_redshift_params()
    msh = mock_session_helper(
        region=redshift_params['region'],
        cluster_id=redshift_params['cluster_id'],
        host=redshift_params['host'],
        port=redshift_params['port'],
        db_name=redshift_params['db_name'])
    msh.configure_session_helper()

    # Inject an int into an otherwise text-typed column to force a mixed-type column.
    dataframe.iat[5, dataframe.columns.get_loc("text_col")] = 45

    parq.publish(
        bucket=bucket,
        key=key,
        dataframe=dataframe,
        partitions=partitions,
        redshift_params=redshift_params)

    df_types = parq._get_dataframe_datatypes(dataframe, partitions)
    partition_types = parq._get_dataframe_datatypes(dataframe, partitions, True)

    mock_create_table.assert_called_once_with(
        redshift_params['table_name'],
        redshift_params['schema_name'],
        df_types,
        partition_types,
        parq.s3_url(bucket, key),
        msh)

def test_table_publish(self, mock_session_helper, mock_create_table):
    with get_s3_client() as s3_client:
        dataframe = setup_grouped_dataframe()
        bucket, key = self.setup_s3(s3_client)
        partitions = ["text_col", "int_col", "float_col"]
        redshift_params = self.setup_redshift_params()
        msh = mock_session_helper(
            region=redshift_params['region'],
            cluster_id=redshift_params['cluster_id'],
            host=redshift_params['host'],
            port=redshift_params['port'],
            db_name=redshift_params['db_name'])
        msh.configure_session_helper()

        parq.publish(
            bucket=bucket,
            key=key,
            dataframe=dataframe,
            partitions=partitions,
            redshift_params=redshift_params)

        df_types = parq._get_dataframe_datatypes(dataframe, partitions)
        partition_types = parq._get_dataframe_datatypes(dataframe, partitions, True)

        mock_create_table.assert_called_once_with(
            redshift_params['table_name'],
            redshift_params['schema_name'],
            df_types,
            partition_types,
            parq.s3_url(bucket, key),
            msh)

def test_s3_partitioned_parquet_to_dataframe():
    partition_types = {
        "string_col": "string",
        "int_col": "integer",
        "float_col": "float",
        "bool_col": "boolean",
        "datetime_col": "datetime",
    }
    columns = dict(partition_types)
    columns["metrics"] = "int"
    bucket = "foobucket"
    key = "fookey"
    df = setup_grouped_dataframe(count=10, columns=columns)
    bucket, parquet_paths = setup_partitioned_parquet(
        dataframe=df,
        bucket=bucket,
        key=key,
        partition_data_types=partition_types)

    first_published_file = parquet_paths[0]
    response = fetch_parq._s3_parquet_to_dataframe(
        bucket=bucket, key=first_published_file, partition_metadata=partition_types)

    assert isinstance(response, pd.DataFrame)
    for col in columns:
        assert col in response.columns

    # Stitch every partition file back together; pd.concat replaces the
    # DataFrame.append API removed in pandas 2.0.
    full_response = pd.concat(
        fetch_parq._s3_parquet_to_dataframe(
            bucket=bucket, key=path, partition_metadata=partition_types)
        for path in parquet_paths)

    assert full_response.shape == df.shape
    sorted_dfs_equal_by_pandas_testing(full_response, df)

def test_works_without_partitions(self):
    dataframe = setup_grouped_dataframe()
    bucket, key = self.setup_s3()
    partitions = []
    parq.publish(bucket=bucket, key=key, dataframe=dataframe, partitions=partitions)

def test_dataframe_sans_partitions(self):
    dataframe = setup_grouped_dataframe()
    partitions = ["text_col", "int_col", "float_col"]
    dataframe_dtypes = dataframe.dtypes.to_dict()
    for part in partitions:
        dataframe_dtypes.pop(part, None)

    assert parq._get_dataframe_datatypes(dataframe, partitions) == dataframe_dtypes

def test_no_redshift_publish(self):
    dataframe = setup_grouped_dataframe()
    bucket, key = self.setup_s3()
    partitions = []
    parq.publish(bucket=bucket, key=key, dataframe=dataframe, partitions=partitions)

def test_generates_partitions_in_order(self):
    dataframe = setup_grouped_dataframe()
    bucket, key = self.setup_s3()
    partitions = dataframe.columns[:2]
    with patch('s3parq.publish_parq.pq.write_to_dataset', return_value=None) as mock_method:
        parq._gen_parquet_to_s3(bucket, key, dataframe, partitions)
        arg, kwarg = mock_method.call_args
        assert (kwarg['partition_cols'] == partitions).all()

def test_no_redshift_publish(self):
    with get_s3_client() as s3_client:
        dataframe = setup_grouped_dataframe()
        bucket, key = self.setup_s3(s3_client)
        partitions = []
        parq.publish(bucket=bucket, key=key, dataframe=dataframe, partitions=partitions)

def test_partition_datatypes(self):
    dataframe = setup_grouped_dataframe()
    partitions = ["text_col", "int_col", "float_col"]
    dataframe_dtypes = dataframe.dtypes.to_dict()
    part_dtypes = {}
    for part in partitions:
        part_dtypes[part] = dataframe_dtypes.pop(part, None)

    assert parq._get_dataframe_datatypes(dataframe, partitions, True) == part_dtypes

def test_reject_timedelta_dataframes(self):
    dataframe = setup_grouped_dataframe()
    bucket, key = self.setup_s3()
    partitions = ['text_col']
    dataframe['time_col'] = pd.Timedelta('1 days')
    with pytest.raises(NotImplementedError):
        parq.publish(bucket=bucket, key=key, dataframe=dataframe, partitions=partitions)

def test_input_equals_output(self):
    dataframe = setup_grouped_dataframe()
    bucket, key = self.setup_s3()
    s3_path = f"s3://{bucket}/{key}"
    partitions = [dataframe.columns[0]]
    parq.publish(bucket=bucket, key=key, dataframe=dataframe, partitions=partitions)

    from_s3 = pq.ParquetDataset(s3_path, filesystem=s3fs.S3FileSystem())
    s3pd = from_s3.read().to_pandas()
    # Switch the partition column's dtype back: pyarrow reads partition
    # columns as pandas categoricals by default.
    s3pd[partitions[0]] = s3pd[partitions[0]].astype(dataframe[partitions[0]].dtype)

    sorted_dfs_equal_by_pandas_testing(dataframe, s3pd)

def test_gets_max():
    key = "safekeyprefixname/safedatasetname"
    bucket = "safebucketname"
    part_types = {"int_col": "int", "float_col": "float"}
    df = setup_grouped_dataframe(count=10, columns=part_types)
    bucket, parquet_paths = setup_partitioned_parquet(
        dataframe=df,
        bucket=bucket,
        key=key,
        partition_data_types={"int_col": "int"})

    fetched_max = fetch_parq.get_max_partition_value(
        bucket=bucket, key=key, partition="int_col")

    # The max of the column should be the max of the fetched partition
    assert df["int_col"].max() == fetched_max

def test_gets_max_denies_text():
    key = "safekeyprefixname/safedatasetname"
    bucket = "safebucketname"
    part_types = {"string_col": "string", "bool_col": "bool"}
    col_types = dict(part_types)
    col_types["metrics"] = "int"
    df = setup_grouped_dataframe(count=10, columns=col_types)
    bucket, parquet_paths = setup_partitioned_parquet(
        dataframe=df,
        bucket=bucket,
        key=key,
        partition_data_types=part_types)

    with pytest.raises(ValueError):
        fetch_parq.get_max_partition_value(
            bucket=bucket, key=key, partition="string_col")

    with pytest.raises(ValueError):
        fetch_parq.get_max_partition_value(
            bucket=bucket, key=key, partition="bool_col")

def test_set_metadata_correctly(self):
    dataframe = setup_grouped_dataframe()
    bucket, key = self.setup_s3()
    s3_client = boto3.client('s3')
    partitions = ['string_col', 'int_col', 'bool_col']
    parq.publish(bucket=bucket, key=key, dataframe=dataframe, partitions=partitions)

    for obj in s3_client.list_objects(Bucket=bucket)['Contents']:
        if obj['Key'].endswith(".parquet"):
            meta = s3_client.get_object(Bucket=bucket, Key=obj['Key'])['Metadata']
            assert meta['partition_data_types'] == str({
                "string_col": "string",
                "int_col": "integer",
                "bool_col": "boolean"
            })

def test_schema_publish(self, mock_session_helper, mock_create_schema):
    dataframe = setup_grouped_dataframe()
    bucket, key = self.setup_s3()
    partitions = [dataframe.columns[0]]
    redshift_params = self.setup_redshift_params()
    msh = mock_session_helper(
        region=redshift_params['region'],
        cluster_id=redshift_params['cluster_id'],
        host=redshift_params['host'],
        port=redshift_params['port'],
        db_name=redshift_params['db_name'])
    msh.configure_session_helper()

    parq.publish(
        bucket=bucket,
        key=key,
        dataframe=dataframe,
        partitions=partitions,
        redshift_params=redshift_params)

    mock_create_schema.assert_called_once_with(
        redshift_params['schema_name'],
        redshift_params['db_name'],
        redshift_params['iam_role'],
        msh)

def test_accepts_valid_partitions(self):
    dataframe = setup_grouped_dataframe()
    parq.check_partitions(dataframe.columns, dataframe)

def test_df_datatypes(self):
    dataframe = setup_grouped_dataframe()
    assert parq._get_dataframe_datatypes(dataframe) == dataframe.dtypes.to_dict()

def test_reject_non_column_partitions(self):
    dataframe = setup_grouped_dataframe()
    with pytest.raises(ValueError):
        parq.check_partitions(('banana',), dataframe)
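

# ---------------------------------------------------------------------------
# Reference sketches of the shared helpers assumed by the tests above. These
# are minimal, hypothetical implementations inferred from how the tests call
# them, assuming an in-memory S3 mock via moto (>= 5, which exposes mock_aws);
# the real helpers in the repo's test utilities may differ in signature and
# behavior. setup_s3 / setup_redshift_params belong to the test classes and
# are not sketched here.
# ---------------------------------------------------------------------------
import contextlib
import io

from moto import mock_aws  # hypothetical dependency for the S3 mock


@contextlib.contextmanager
def get_s3_client():
    # Yield a boto3 S3 client backed by moto's in-memory S3 mock.
    with mock_aws():
        yield boto3.client("s3", region_name="us-east-1")


def setup_grouped_dataframe(count=10, columns=None):
    # Build a small deterministic dataframe with one column per logical type.
    columns = columns or {
        "text_col": "string", "int_col": "integer", "float_col": "float"}
    make = {
        "string": lambda i: f"row_{i}",
        "integer": lambda i: i,
        "int": lambda i: i,
        "float": lambda i: i / 2,
        "boolean": lambda i: i % 2 == 0,
        "bool": lambda i: i % 2 == 0,
        "datetime": lambda i: pd.Timestamp("2020-01-01") + pd.Timedelta(days=i),
    }
    return pd.DataFrame({
        name: [make[dtype](i) for i in range(count)]
        for name, dtype in columns.items()})


def setup_partitioned_parquet(dataframe, bucket, key, partition_data_types,
                              s3_client=None):
    # Write the dataframe to (mocked) S3 as hive-partitioned parquet files and
    # return the bucket plus the keys written. Callers that omit s3_client are
    # assumed to run inside an active moto mock (e.g. a fixture or decorator).
    s3_client = s3_client or boto3.client("s3", region_name="us-east-1")
    s3_client.create_bucket(Bucket=bucket)
    partitions = list(partition_data_types)
    paths = []

    # No partitions: publish a single file directly under the key prefix.
    groups = ([((), dataframe)] if not partitions
              else dataframe.groupby(partitions))
    for i, (values, group) in enumerate(groups):
        if not isinstance(values, tuple):
            values = (values,)
        suffix = "/".join(f"{col}={val}" for col, val in zip(partitions, values))
        path = "/".join(p for p in (key, suffix, f"part-{i}.parquet") if p)
        buf = io.BytesIO()
        # Hive convention: partition columns live in the path, not the file.
        group.drop(columns=partitions).to_parquet(buf, index=False)
        s3_client.put_object(
            Bucket=bucket, Key=path, Body=buf.getvalue(),
            Metadata={"partition_data_types": str(partition_data_types)})
        paths.append(path)
    return bucket, paths


def sorted_dfs_equal_by_pandas_testing(df1, df2):
    # Row-order-insensitive equality check via pandas' own testing helpers.
    cols = list(df1.columns)
    pd.testing.assert_frame_equal(
        df1.sort_values(cols).reset_index(drop=True),
        df2.sort_values(cols).reset_index(drop=True),
        check_like=True)  # also ignore column ordering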