def spark_session():
    """Return a local SparkSession for tests, or None when pyspark is missing.

    The netty reflection flag is required for Spark on newer JVMs; shuffle
    partitions are kept tiny so test jobs stay fast.
    """
    pyspark = import_or_none('pyspark.sql')
    if not pyspark:
        return None
    builder = pyspark.SparkSession.builder.master("local[*]")
    builder = builder.config(
        "spark.driver.extraJavaOptions",
        "-Dio.netty.tryReflectionSetAccessible=True",
    )
    builder = builder.config("spark.sql.shuffle.partitions", "2")
    return builder.getOrCreate()
def spark_session():
    """Create or reuse a local SparkSession for testing.

    Returns None when pyspark is not installed. Binds the driver to
    localhost so the session works on machines with odd network setups.
    """
    pyspark = import_or_none("pyspark.sql")
    if not pyspark:
        return None
    builder = pyspark.SparkSession.builder.master("local[*]")
    builder = builder.config(
        "spark.driver.extraJavaOptions",
        "-Dio.netty.tryReflectionSetAccessible=True",
    )
    builder = builder.config("spark.sql.shuffle.partitions", "2")
    builder = builder.config("spark.driver.bindAddress", "127.0.0.1")
    return builder.getOrCreate()
def test_import_or_none():
    """An importable module is returned as-is; an unknown name yields None."""
    pandas_module = import_or_none("pandas")
    assert pandas_module == pd
    missing_module = import_or_none("nonexistent")
    assert missing_module is None
    _infer_datetime_format,
    _is_latlong_nan,
    _is_nan,
    _is_s3,
    _is_url,
    _is_valid_latlong_series,
    _is_valid_latlong_value,
    _parse_logical_type,
    _reformat_to_latlong,
    camel_to_snake,
    get_valid_mi_types,
    import_or_none,
    import_or_raise,
)

# Optional test backends; each is None when the library is not installed.
dd = import_or_none("dask.dataframe")
ps = import_or_none("pyspark.pandas")


def test_camel_to_snake():
    # Each CamelCase key must convert to the snake_case value, including
    # multi-word names and names that are already a single word.
    test_items = {
        "PostalCode": "postal_code",
        "SubRegionCode": "sub_region_code",
        "NaturalLanguage": "natural_language",
        "Categorical": "categorical",
    }
    for key, value in test_items.items():
        assert camel_to_snake(key) == value
from woodwork.utils import import_or_none

# numpy/pandas dtypes that koalas cannot represent; tests filter these out
# before exercising koalas-backed data.
UNSUPPORTED_KOALAS_DTYPES = [
    'int32',
    'intp',
    'uint8',
    'uint16',
    'uint32',
    'uint64',
    'uintp',
    'float_',
    'object',
    'category',
]

# None when databricks.koalas is not installed.
ks = import_or_none('databricks.koalas')


def get_koalas_dtypes(dtypes):
    # Keep only the dtypes that koalas supports.
    return [
        dtype for dtype in dtypes if dtype not in UNSUPPORTED_KOALAS_DTYPES
    ]


def test_integer_inference(integers):
    dtypes = ['int8', 'int16', 'int32', 'int64', 'intp', 'int', 'Int64']
    # Koalas series cannot hold every dtype, so narrow the list first.
    if ks and isinstance(integers[0], ks.Series):
        dtypes = get_koalas_dtypes(dtypes)
    for series in integers:
        for dtype in dtypes:
            # NOTE(review): loop body continues beyond this chunk.
import pandas as pd

from woodwork.utils import import_or_none

# Optional backends; each is None when the library is not installed.
dd = import_or_none('dask.dataframe')
ks = import_or_none('databricks.koalas')


def validate_subset_schema(subset_schema, schema):
    # Every column in the subset must exist in the full schema and agree on
    # logical type and semantic tags.
    assert subset_schema.name == schema.name
    for subset_col_name, subset_col in subset_schema.columns.items():
        assert subset_col_name in schema.columns
        col = schema.columns[subset_col_name]
        assert subset_col.logical_type == col.logical_type
        assert subset_col.semantic_tags == col.semantic_tags


def mi_between_cols(col1, col2, df):
    # Mutual information is symmetric, so try (col1, col2) first and fall
    # back to the reversed pair when no row matches.
    mi_series = df.loc[df['column_1'] == col1].loc[df['column_2'] == col2]['mutual_info']
    if len(mi_series) == 0:
        mi_series = df.loc[df['column_1'] == col2].loc[df['column_2'] == col1]['mutual_info']
    return mi_series.iloc[0]


def to_pandas(df, index=None, sort_index=False):
    """Testing util to convert dataframes to pandas. If a pandas dataframe is passed in, just returns the dataframe.
import pytest

from woodwork.exceptions import WoodworkNotInitError
from woodwork.indexers import _iLocIndexer, _locIndexer
from woodwork.logical_types import (
    Categorical,
    Datetime,
    Double,
    EmailAddress,
    Integer,
    PhoneNumber
)
from woodwork.tests.testing_utils import to_pandas
from woodwork.utils import import_or_none

# None when dask is not installed.
dd = import_or_none('dask.dataframe')


def test_iLocIndexer_class_error(sample_df_dask, sample_series_dask):
    # Constructing an iloc indexer over Dask objects must raise immediately,
    # with a message that names the offending container type.
    with pytest.raises(TypeError, match="iloc is not supported for Dask DataFrames"):
        _iLocIndexer(sample_df_dask)
    with pytest.raises(TypeError, match="iloc is not supported for Dask Series"):
        _iLocIndexer(sample_series_dask)


def test_iLocIndexer_class(sample_df):
    # sample_df is parametrized across backends; Dask inputs are expected to fail.
    if dd and isinstance(sample_df, dd.DataFrame):
        pytest.xfail('iloc is not supported with Dask inputs')
    sample_df.ww.init()
    ind = _iLocIndexer(sample_df)
    # NOTE(review): function body continues beyond this chunk.
def test_import_or_none():
    """A real module comes back as the module object; a bogus name yields None."""
    assert pd == import_or_none('pandas')
    result_for_missing = import_or_none('nonexistent')
    assert result_for_missing is None
from woodwork.utils import import_or_none

# numpy/pandas dtypes that the pyspark.pandas backend cannot represent.
UNSUPPORTED_SPARK_DTYPES = [
    "int32",
    "intp",
    "uint8",
    "uint16",
    "uint32",
    "uint64",
    "uintp",
    "float_",
    "object",
    "category",
]

# None when pyspark is not installed.
ps = import_or_none("pyspark.pandas")


def get_spark_dtypes(dtypes):
    """Filter the given dtypes down to those the spark backend supports."""
    supported = []
    for dtype in dtypes:
        if dtype not in UNSUPPORTED_SPARK_DTYPES:
            supported.append(dtype)
    return supported


def test_integer_inference(integers):
    """Every integer dtype should be inferred as the Integer logical type."""
    candidate_dtypes = ["int8", "int16", "int32", "int64", "intp", "int", "Int64"]
    # Spark-backed series cannot hold every dtype, so narrow the list first.
    if _is_spark_series(integers[0]):
        candidate_dtypes = get_spark_dtypes(candidate_dtypes)
    for series in integers:
        for candidate in candidate_dtypes:
            inferred = ww.type_system.infer_logical_type(series.astype(candidate))
            assert isinstance(inferred, Integer)
from woodwork.accessor_utils import _is_dask_dataframe, _is_dask_series
from woodwork.exceptions import WoodworkNotInitError
from woodwork.indexers import _iLocIndexer, _locIndexer
from woodwork.logical_types import (
    Categorical,
    Datetime,
    Double,
    EmailAddress,
    Integer,
    PhoneNumber,
)
from woodwork.tests.testing_utils import to_pandas
from woodwork.utils import import_or_none

# None when dask is not installed.
dd = import_or_none("dask.dataframe")


def test_iLocIndexer_class_error(sample_df_dask, sample_series_dask):
    # Constructing an iloc indexer over Dask objects must raise immediately,
    # with a message that names the offending container type.
    with pytest.raises(TypeError, match="iloc is not supported for Dask DataFrames"):
        _iLocIndexer(sample_df_dask)
    with pytest.raises(TypeError, match="iloc is not supported for Dask Series"):
        _iLocIndexer(sample_series_dask)


def test_iLocIndexer_class(sample_df):
    # sample_df is parametrized across backends; Dask inputs are expected to fail.
    if _is_dask_dataframe(sample_df):
        pytest.xfail("iloc is not supported with Dask inputs")
    # NOTE(review): function body continues beyond this chunk.