def test_table_publish_mixed_type_column(self, mock_session_helper, mock_create_table):
    """Publishing a dataframe whose text column has an int mixed in still
    triggers exactly one Redshift table-creation call with the derived dtypes.

    NOTE(review): unlike test_table_publish, this calls self.setup_s3() with
    no client and no get_s3_client() context — confirm that is intentional.
    """
    df = setup_grouped_dataframe()
    bucket, key = self.setup_s3()
    partitions = []
    redshift_params = self.setup_redshift_params()
    session_keys = ('region', 'cluster_id', 'host', 'port', 'db_name')
    msh = mock_session_helper(**{k: redshift_params[k] for k in session_keys})
    msh.configure_session_helper()
    # Corrupt one cell of the text column so the column is mixed-type.
    df.iat[5, df.columns.get_loc("text_col")] = 45
    parq.publish(bucket=bucket, key=key, dataframe=df,
                 partitions=partitions, redshift_params=redshift_params)
    expected_df_types = parq._get_dataframe_datatypes(df, partitions)
    expected_partition_types = parq._get_dataframe_datatypes(df, partitions, True)
    mock_create_table.assert_called_once_with(
        redshift_params['table_name'],
        redshift_params['schema_name'],
        expected_df_types,
        expected_partition_types,
        parq.s3_url(bucket, key),
        msh,
    )
def test_table_publish(self, mock_session_helper, mock_create_table):
    """Publishing with partitions triggers exactly one Redshift
    table-creation call with dtypes split into column vs partition types."""
    with get_s3_client() as s3_client:
        df = setup_grouped_dataframe()
        bucket, key = self.setup_s3(s3_client)
        partitions = ["text_col", "int_col", "float_col"]
        redshift_params = self.setup_redshift_params()
        session_keys = ('region', 'cluster_id', 'host', 'port', 'db_name')
        msh = mock_session_helper(**{k: redshift_params[k] for k in session_keys})
        msh.configure_session_helper()
        parq.publish(bucket=bucket, key=key, dataframe=df,
                     partitions=partitions, redshift_params=redshift_params)
        expected_df_types = parq._get_dataframe_datatypes(df, partitions)
        expected_partition_types = parq._get_dataframe_datatypes(df, partitions, True)
        mock_create_table.assert_called_once_with(
            redshift_params['table_name'],
            redshift_params['schema_name'],
            expected_df_types,
            expected_partition_types,
            parq.s3_url(bucket, key),
            msh,
        )
def test_dataframe_sans_partitions_grouped(self):
    """Non-partition dtypes of the grouped fixture equal the dataframe's own
    dtypes with the partition columns removed.

    Renamed from ``test_dataframe_sans_partitions``: a method of the same
    name is defined later in this class, which shadowed this one so it was
    never collected or run.
    """
    dataframe = setup_grouped_dataframe()
    partitions = ["text_col", "int_col", "float_col"]
    # Expected result: every dtype except those of the partition columns.
    expected = dataframe.dtypes.to_dict()
    for part in partitions:
        expected.pop(part, None)
    assert parq._get_dataframe_datatypes(dataframe, partitions) == expected
def test_partition_datatypes_literal(self):
    """Partition dtypes match the literal expected type names for the
    setup_df fixture.

    Renamed from ``test_partition_datatypes``: a method of the same name is
    defined later in this class, which shadowed this one so it was never
    collected or run.
    """
    columns, dataframe = self.setup_df()
    partitions = ["text_col", "int_col", "float_col"]
    assert parq._get_dataframe_datatypes(dataframe, partitions, True) == {
        'text_col': 'object',
        'int_col': 'int64',
        'float_col': 'float64',
    }
def test_df_datatypes_literal(self):
    """All column dtypes match the literal expected type names for the
    setup_df fixture.

    Renamed from ``test_df_datatypes``: a method of the same name is defined
    later in this class, which shadowed this one so it was never collected
    or run.
    """
    columns, dataframe = self.setup_df()
    assert parq._get_dataframe_datatypes(dataframe) == {
        'grouped_col': 'object',
        'text_col': 'object',
        'int_col': 'int64',
        'float_col': 'float64',
    }
def test_partition_datatypes(self):
    """Partition dtypes returned by _get_dataframe_datatypes equal the
    dataframe's own dtypes restricted to the partition columns."""
    df = setup_grouped_dataframe()
    partitions = ["text_col", "int_col", "float_col"]
    all_dtypes = df.dtypes.to_dict()
    # Same result as popping each partition out of the dtype dict: missing
    # partition names map to None.
    expected = {name: all_dtypes.get(name) for name in partitions}
    assert parq._get_dataframe_datatypes(df, partitions, True) == expected
def test_df_datatypes(self):
    """With no partitions given, _get_dataframe_datatypes returns every
    column's dtype exactly as pandas reports it."""
    df = setup_grouped_dataframe()
    expected = df.dtypes.to_dict()
    assert parq._get_dataframe_datatypes(df) == expected
def test_dataframe_sans_partitions(self):
    """Only the non-partition column remains once the three partition
    columns are excluded from the dtype mapping."""
    columns, df = self.setup_df()
    partitions = ["text_col", "int_col", "float_col"]
    expected = {'grouped_col': 'object'}
    assert parq._get_dataframe_datatypes(df, partitions) == expected