def test_custom_table_publish_null_in_int_column(self, mock_session_helper, mock_create_custom_table):
    with get_s3_client() as s3_client:
        dataframe, custom_redshift_columns = setup_custom_redshift_columns_and_dataframe_with_null()
        bucket, key = self.setup_s3(s3_client)
        partitions = []
        redshift_params = self.setup_redshift_params()
        msh = mock_session_helper(region=redshift_params['region'],
                                  cluster_id=redshift_params['cluster_id'],
                                  host=redshift_params['host'],
                                  port=redshift_params['port'],
                                  db_name=redshift_params['db_name'])
        msh.configure_session_helper()
        parq.custom_publish(
            bucket=bucket,
            key=key,
            dataframe=dataframe,
            partitions=partitions,
            redshift_params=redshift_params,
            custom_redshift_columns=custom_redshift_columns)

        mock_create_custom_table.assert_called_once_with(
            redshift_params['table_name'], redshift_params['schema_name'],
            partitions, parq.s3_url(bucket, key), custom_redshift_columns, msh)
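# NOTE: a hedged sketch of what setup_custom_redshift_columns_and_dataframe_with_null
# might return -- the real fixture is defined elsewhere in this module, and the
# shape of custom_redshift_columns below is an assumption for illustration only.
def _sketch_columns_and_dataframe_with_null():
    import pandas as pd
    dataframe = pd.DataFrame({
        "colA": ["x", "y", "z"],
        # pandas' nullable Int64 dtype lets an integer column carry NULLs
        "colB": pd.array([1, None, 3], dtype="Int64"),
    })
    custom_redshift_columns = {"colA": "VARCHAR(10)", "colB": "INTEGER"}
    return dataframe, custom_redshift_columns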
def test_custom_table_publish_mixed_type_column(self, mock_session_helper, mock_create_custom_table):
    dataframe, custom_redshift_columns = setup_custom_redshift_columns_and_dataframe()
    bucket, key = self.setup_s3()
    partitions = []
    redshift_params = self.setup_redshift_params()
    msh = mock_session_helper(region=redshift_params['region'],
                              cluster_id=redshift_params['cluster_id'],
                              host=redshift_params['host'],
                              port=redshift_params['port'],
                              db_name=redshift_params['db_name'])
    msh.configure_session_helper()
    # Overwrite a string cell with an int so colA becomes a mixed-type column
    dataframe.iat[1, dataframe.columns.get_loc("colA")] = 45
    parq.custom_publish(
        bucket=bucket,
        key=key,
        dataframe=dataframe,
        partitions=partitions,
        redshift_params=redshift_params,
        custom_redshift_columns=custom_redshift_columns)

    mock_create_custom_table.assert_called_once_with(
        redshift_params['table_name'], redshift_params['schema_name'],
        partitions, parq.s3_url(bucket, key), custom_redshift_columns, msh)
def test_custom_publish_set_metadata_correctly(self):
    with get_s3_client() as s3_client:
        dataframe, custom_redshift_columns = setup_custom_redshift_columns_and_dataframe()
        bucket, key = self.setup_s3(s3_client)
        partitions = ['colA', 'colB', 'colC', 'colD', 'colF']
        parq.custom_publish(
            bucket=bucket,
            key=key,
            dataframe=dataframe,
            partitions=partitions,
            custom_redshift_columns=custom_redshift_columns)

        for obj in s3_client.list_objects(Bucket=bucket)['Contents']:
            if obj['Key'].endswith(".parquet"):
                meta = s3_client.get_object(
                    Bucket=bucket, Key=obj['Key'])['Metadata']
                assert meta['partition_data_types'] == str({
                    "colA": "string",
                    "colB": "integer",
                    "colC": "float",
                    "colD": "decimal",
                    "colF": "boolean"
                })
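# The partition_data_types metadata asserted above is stored as the str() of a
# plain Python dict, so it can be parsed back with ast.literal_eval. A minimal
# usage sketch (the helper name is hypothetical):
def _parse_partition_metadata(meta_value: str) -> dict:
    import ast
    # e.g. "{'colA': 'string'}" -> {'colA': 'string'}
    return ast.literal_eval(meta_value)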
def test_custom_publish_no_redshift_publish(self):
    dataframe, custom_redshift_columns = setup_custom_redshift_columns_and_dataframe()
    bucket, key = self.setup_s3()
    partitions = []
    parq.custom_publish(
        bucket=bucket,
        key=key,
        dataframe=dataframe,
        partitions=partitions,
        custom_redshift_columns=custom_redshift_columns)
def test_custom_publish_works_without_partitions(self):
    with get_s3_client() as s3_client:
        dataframe, custom_redshift_columns = setup_custom_redshift_columns_and_dataframe()
        bucket, key = self.setup_s3(s3_client)
        partitions = []
        parq.custom_publish(
            bucket=bucket,
            key=key,
            dataframe=dataframe,
            partitions=partitions,
            custom_redshift_columns=custom_redshift_columns)
def test_custom_publish_reject_timedelta_dataframes(self):
    dataframe, custom_redshift_columns = setup_custom_redshift_columns_and_dataframe()
    bucket, key = self.setup_s3()
    partitions = ['colA']
    # The scalar broadcasts, giving every row a timedelta64[ns] value
    dataframe['time_col'] = pd.Timedelta('1 days')
    with pytest.raises(NotImplementedError):
        parq.custom_publish(
            bucket=bucket,
            key=key,
            dataframe=dataframe,
            partitions=partitions,
            custom_redshift_columns=custom_redshift_columns)
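# Why the test above raises: assigning a scalar pd.Timedelta broadcasts it to
# every row, producing a timedelta64[ns] column, which custom_publish rejects
# with NotImplementedError. A minimal, hedged demonstration of the dtype:
def _sketch_timedelta_column():
    import pandas as pd
    df = pd.DataFrame({"colA": [1, 2]})
    df["time_col"] = pd.Timedelta("1 days")  # broadcast to both rows
    assert str(df["time_col"].dtype) == "timedelta64[ns]"
    return df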
def test_custom_publish_reject_empty_dataframe(self):
    dataframe = pd.DataFrame()
    custom_redshift_columns = setup_custom_redshift_columns_and_dataframe()[1]
    bucket, key = self.setup_s3()
    with pytest.raises(ValueError):
        parq.custom_publish(
            bucket=bucket,
            key=key,
            dataframe=dataframe,
            partitions=[],
            custom_redshift_columns=custom_redshift_columns)
def test_custom_publish_input_equals_output(self):
    dataframe, custom_redshift_columns = setup_custom_redshift_columns_and_dataframe()
    bucket, key = self.setup_s3()
    s3_path = f"s3://{bucket}/{key}"
    partitions = [dataframe.columns[0]]
    parq.custom_publish(
        bucket=bucket,
        key=key,
        dataframe=dataframe,
        partitions=partitions,
        custom_redshift_columns=custom_redshift_columns)

    from_s3 = pq.ParquetDataset(s3_path, filesystem=s3fs.S3FileSystem())
    s3pd = from_s3.read().to_pandas()
    # Cast the partition column back: reading a partitioned dataset returns
    # it as a category dtype by default
    s3pd[partitions[0]] = s3pd[partitions[0]].astype(
        dataframe[partitions[0]].dtype)

    sorted_dfs_equal_by_pandas_testing(dataframe, s3pd)
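# A hedged sketch of what sorted_dfs_equal_by_pandas_testing likely does:
# order both frames identically, reset the index, and defer to pandas' own
# assertion. The real helper is imported from elsewhere in the test suite.
def _sketch_sorted_dfs_equal(left, right):
    import pandas as pd
    sort_cols = list(left.columns)
    left_sorted = left.sort_values(sort_cols).reset_index(drop=True)
    right_sorted = right[sort_cols].sort_values(sort_cols).reset_index(drop=True)
    pd.testing.assert_frame_equal(left_sorted, right_sorted)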
def test_custom_publish_schema_publish(self, mock_session_helper, mock_create_schema):
    dataframe, custom_redshift_columns = setup_custom_redshift_columns_and_dataframe()
    bucket, key = self.setup_s3()
    partitions = [dataframe.columns[0]]
    redshift_params = self.setup_redshift_params()
    msh = mock_session_helper(region=redshift_params['region'],
                              cluster_id=redshift_params['cluster_id'],
                              host=redshift_params['host'],
                              port=redshift_params['port'],
                              db_name=redshift_params['db_name'])
    msh.configure_session_helper()
    parq.custom_publish(
        bucket=bucket,
        key=key,
        dataframe=dataframe,
        partitions=partitions,
        redshift_params=redshift_params,
        custom_redshift_columns=custom_redshift_columns)

    mock_create_schema.assert_called_once_with(
        redshift_params['schema_name'], redshift_params['db_name'],
        redshift_params['iam_role'], msh)
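# A hedged sketch of setup_redshift_params, inferred from the keys these tests
# read; every value below is a placeholder, not the fixture's real data.
def _sketch_redshift_params():
    return {
        'region': 'us-east-1',
        'cluster_id': 'test_cluster',
        'host': 'test_host',
        'port': '5439',
        'db_name': 'test_db',
        'table_name': 'test_table',
        'schema_name': 'test_schema',
        'iam_role': 'test_iam_role',
    }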