def score_pipelines(pipelines, engine):
    # Note: `X`, `y`, and `automl_data` are assumed to be defined in the enclosing scope.
    futures = []
    for pipeline in pipelines:
        futures.append(engine.submit_training_job(X=ww.DataTable(X), y=ww.DataColumn(y),
                                                  automl_config=automl_data, pipeline=pipeline))
    pipelines = [f.get_result() for f in futures]
    futures = []
    for pipeline in pipelines:
        futures.append(engine.submit_scoring_job(X=ww.DataTable(X), y=ww.DataColumn(y),
                                                 automl_config=automl_data, pipeline=pipeline,
                                                 objectives=[automl_data.objective]))
    results = [f.get_result() for f in futures]
    return results
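# Hypothetical usage sketch for score_pipelines above (illustrative only, not part of the
# original snippet): it assumes `X`, `y`, `automl_data`, and a list of `pipelines` are already
# defined in the enclosing scope, and that an evalml engine such as SequentialEngine is
# available (exact import path may vary by version):
#
#     engine = SequentialEngine()
#     pipeline_scores = score_pipelines(pipelines, engine)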
def test_compute_final_component_features_nonlinear(mock_en_predict, mock_rf_predict, mock_ohe,
                                                    mock_imputer, example_graph, X_y_binary):
    X, y = X_y_binary
    mock_imputer.return_value = ww.DataTable(pd.DataFrame(X))
    mock_ohe.return_value = ww.DataTable(pd.DataFrame(X))
    mock_en_predict.return_value = ww.DataColumn(pd.Series(np.ones(X.shape[0])))
    mock_rf_predict.return_value = ww.DataColumn(pd.Series(np.zeros(X.shape[0])))
    X_expected = pd.DataFrame({'Random Forest': np.zeros(X.shape[0]),
                               'Elastic Net': np.ones(X.shape[0])})
    component_graph = ComponentGraph(example_graph).instantiate({})
    component_graph.fit(X, y)

    X_t = component_graph.compute_final_component_features(X)
    assert_frame_equal(X_expected, X_t.to_dataframe())
    assert mock_imputer.call_count == 2
    assert mock_ohe.call_count == 4
def test_delay_feature_transformer_woodwork_custom_overrides_returned_by_components(X_df, fit_transform):
    y = pd.Series([1, 2, 1])
    override_types = [Integer, Double, Categorical, Datetime, Boolean]
    for logical_type in override_types:
        try:
            X = ww.DataTable(X_df, logical_types={0: logical_type})
        except TypeError:
            continue
        dft = DelayedFeatureTransformer(max_delay=1, gap=11)
        if fit_transform:
            transformed = dft.fit_transform(X, y)
        else:
            dft.fit(X, y)
            transformed = dft.transform(X, y)
        assert isinstance(transformed, ww.DataTable)
        if logical_type in [Integer, Double, Categorical]:
            assert transformed.logical_types == {0: logical_type,
                                                 '0_delay_1': Double,
                                                 'target_delay_0': Integer,
                                                 'target_delay_1': Double}
        else:
            assert transformed.logical_types == {0: logical_type,
                                                 '0_delay_1': logical_type,
                                                 'target_delay_0': Integer,
                                                 'target_delay_1': Double}
def test_rename_column_names_to_numeric():
    X = np.array([[1, 2], [3, 4]])
    pd.testing.assert_frame_equal(_rename_column_names_to_numeric(X), pd.DataFrame(X))

    X = pd.DataFrame({"<>": [1, 2], ">>": [2, 4]})
    pd.testing.assert_frame_equal(_rename_column_names_to_numeric(X),
                                  pd.DataFrame({0: [1, 2], 1: [2, 4]}))

    X = ww.DataTable(pd.DataFrame({"<>": [1, 2], ">>": [2, 4]}),
                     logical_types={"<>": "categorical", ">>": "categorical"})
    X_renamed = _rename_column_names_to_numeric(X)
    X_expected = pd.DataFrame({0: pd.Series([1, 2], dtype="category"),
                               1: pd.Series([2, 4], dtype="category")})
    pd.testing.assert_frame_equal(X_renamed.to_dataframe(), X_expected)
    assert X_renamed.logical_types == {0: ww.logical_types.Categorical,
                                       1: ww.logical_types.Categorical}
def test_binary_classification_predictions_thresholded_properly(mock_predict, mock_predict_proba,
                                                                mock_obj_decision, mock_decode,
                                                                X_y_binary, dummy_ts_binary_pipeline_class):
    mock_objs = [mock_decode, mock_predict]
    mock_decode.return_value = pd.Series([0, 1])
    X, y = X_y_binary
    binary_pipeline = dummy_ts_binary_pipeline_class(parameters={"Logistic Regression Classifier": {"n_jobs": 1},
                                                                 "pipeline": {"gap": 0, "max_delay": 0}})

    # test no objective passed and no custom threshold uses underlying estimator's predict method
    binary_pipeline.fit(X, y)
    binary_pipeline.predict(X, y)
    for mock_obj in mock_objs:
        mock_obj.assert_called()
        mock_obj.reset_mock()

    # test objective passed but no custom threshold uses underlying estimator's predict method
    binary_pipeline.predict(X, y, 'precision')
    for mock_obj in mock_objs:
        mock_obj.assert_called()
        mock_obj.reset_mock()

    mock_objs = [mock_decode, mock_predict_proba]
    # test custom threshold set but no objective passed
    mock_predict_proba.return_value = ww.DataTable(pd.DataFrame([[0.1, 0.2], [0.1, 0.2]]))
    binary_pipeline.threshold = 0.6
    binary_pipeline._encoder.classes_ = [0, 1]
    binary_pipeline.predict(X, y)
    for mock_obj in mock_objs:
        mock_obj.assert_called()
        mock_obj.reset_mock()
    mock_obj_decision.assert_not_called()
    mock_predict.assert_not_called()

    # test custom threshold set but no objective passed (again)
    binary_pipeline.threshold = 0.6
    binary_pipeline.predict(X, y)
    for mock_obj in mock_objs:
        mock_obj.assert_called()
        mock_obj.reset_mock()
    mock_obj_decision.assert_not_called()
    mock_predict.assert_not_called()

    # test custom threshold set and objective passed
    binary_pipeline.threshold = 0.6
    mock_obj_decision.return_value = pd.Series([1.])
    binary_pipeline.predict(X, y, 'precision')
    for mock_obj in mock_objs:
        mock_obj.assert_called()
        mock_obj.reset_mock()
    mock_predict.assert_not_called()
    mock_obj_decision.assert_called()
def test_transform(X_y_binary, X_y_multi, X_y_regression):
    datasets = locals()
    for dataset in datasets.values():
        X, y = dataset
        X_pd = pd.DataFrame(X)
        X_pd.columns = X_pd.columns.astype(str)
        es = ft.EntitySet()
        es = es.entity_from_dataframe(entity_id="X", dataframe=X_pd, index='index', make_index=True)
        matrix, features = ft.dfs(entityset=es, target_entity="X")

        feature = DFSTransformer()
        feature.fit(X)
        X_feature_matrix = feature.transform(X)
        pd.testing.assert_frame_equal(matrix, X_feature_matrix)
        assert features == feature.features

        feature.fit(X, y)
        feature.transform(X)

        X_ww = ww.DataTable(X_pd)
        feature.fit(X_ww)
        feature.transform(X_ww)
def test_outliers_data_check_input_formats():
    outliers_check = OutliersDataCheck()

    # test empty pd.DataFrame
    assert outliers_check.validate(pd.DataFrame()) == {"warnings": [], "errors": []}

    # test np.array
    a = np.arange(10) * 0.01
    data = np.tile(a, (100, 10))
    X = pd.DataFrame(data=data)
    X.iloc[0, 3] = 1000
    X.iloc[3, 25] = 1000
    X.iloc[5, 55] = 10000
    X.iloc[10, 72] = -1000

    outliers_check = OutliersDataCheck()
    assert outliers_check.validate(X.to_numpy()) == {
        "warnings": [DataCheckWarning(message="Column(s) '3', '25', '55', '72' are likely to have outlier data.",
                                      data_check_name=outliers_data_check_name,
                                      message_code=DataCheckMessageCode.HAS_OUTLIERS,
                                      details={"columns": [3, 25, 55, 72]}).to_dict()],
        "errors": []
    }

    # test Woodwork
    outliers_check = OutliersDataCheck()
    assert outliers_check.validate(ww.DataTable(X)) == {
        "warnings": [DataCheckWarning(message="Column(s) '3', '25', '55', '72' are likely to have outlier data.",
                                      data_check_name=outliers_data_check_name,
                                      message_code=DataCheckMessageCode.HAS_OUTLIERS,
                                      details={"columns": [3, 25, 55, 72]}).to_dict()],
        "errors": []
    }
def test_highly_null_data_check_input_formats():
    highly_null_check = HighlyNullDataCheck(pct_null_threshold=0.8)

    # test empty pd.DataFrame
    assert highly_null_check.validate(pd.DataFrame()) == {"warnings": [], "errors": [], "actions": []}

    # the same result is expected for Woodwork, 2D list, and np.array inputs
    expected = {
        "warnings": [DataCheckWarning(message="Column '0' is 80.0% or more null",
                                      data_check_name=highly_null_data_check_name,
                                      message_code=DataCheckMessageCode.HIGHLY_NULL,
                                      details={"column": 0}).to_dict(),
                     DataCheckWarning(message="Column '1' is 80.0% or more null",
                                      data_check_name=highly_null_data_check_name,
                                      message_code=DataCheckMessageCode.HIGHLY_NULL,
                                      details={"column": 1}).to_dict(),
                     DataCheckWarning(message="Column '2' is 80.0% or more null",
                                      data_check_name=highly_null_data_check_name,
                                      message_code=DataCheckMessageCode.HIGHLY_NULL,
                                      details={"column": 2}).to_dict()],
        "errors": [],
        "actions": []
    }

    # test Woodwork
    ww_input = ww.DataTable(pd.DataFrame([[None, None, None, None, 0], [None, None, None, "hi", 5]]))
    assert highly_null_check.validate(ww_input) == expected

    # test 2D list
    assert highly_null_check.validate([[None, None, None, None, 0], [None, None, None, "hi", 5]]) == expected

    # test np.array
    assert highly_null_check.validate(np.array([[None, None, None, None, 0], [None, None, None, "hi", 5]])) == expected
def test_feature_selectors_woodwork_custom_overrides_returned_by_components(X_df):
    rf_classifier, rf_regressor = make_rf_feature_selectors()
    y = pd.Series([1, 2, 1])
    X_df['another column'] = pd.Series([1., 2., 3.], dtype="float")
    override_types = [Integer, Double, Boolean]
    for logical_type in override_types:
        try:
            X = ww.DataTable(X_df, logical_types={0: logical_type})
        except TypeError:
            continue

        rf_classifier.fit(X, y)
        transformed = rf_classifier.transform(X, y)
        assert isinstance(transformed, ww.DataTable)
        assert transformed.logical_types == {0: logical_type, 'another column': Double}

        rf_regressor.fit(X, y)
        transformed = rf_regressor.transform(X, y)
        assert isinstance(transformed, ww.DataTable)
        assert transformed.logical_types == {0: logical_type, 'another column': Double}
def test_make_pipeline_only_text_columns(input_type, problem_type):
    X = pd.DataFrame({"text": ["string one", "the evalml team is full of wonderful people",
                               "text for a column, this should be a text column!!", "text string", "hello world"],
                      "another text": ["ladidididididida", "cats are great",
                                       "text for a column, this should be a text column!!", "text string", "goodbye world"]})
    y = pd.Series([0, 0, 1, 1, 0])
    if input_type == 'ww':
        X = ww.DataTable(X)
        y = ww.DataColumn(y)
    estimators = get_estimators(problem_type=problem_type)
    pipeline_class = _get_pipeline_base_class(problem_type)
    if problem_type == ProblemTypes.MULTICLASS:
        y = pd.Series([0, 2, 1, 2])
    for estimator_class in estimators:
        if problem_type in estimator_class.supported_problem_types:
            pipeline = make_pipeline(X, y, estimator_class, problem_type)
            assert isinstance(pipeline, type(pipeline_class))
            assert pipeline.custom_hyperparameters is None
            delayed_features = []
            if is_time_series(problem_type):
                delayed_features = [DelayedFeatureTransformer]
            standard_scaler = []
            if estimator_class.model_family == ModelFamily.LINEAR_MODEL:
                standard_scaler = [StandardScaler]
            assert pipeline.component_graph == [TextFeaturizer] + delayed_features + standard_scaler + [estimator_class]
def test_compute_final_component_features_linear(mock_ohe, mock_imputer, X_y_binary):
    X, y = X_y_binary
    X = pd.DataFrame(X)
    X_expected = X.fillna(0)
    mock_imputer.return_value = ww.DataTable(X)
    mock_ohe.return_value = ww.DataTable(X_expected)

    component_list = ['Imputer', 'One Hot Encoder', 'Random Forest Classifier']
    component_graph = ComponentGraph().from_list(component_list)
    component_graph.instantiate({})
    component_graph.fit(X, y)

    X_t = component_graph.compute_final_component_features(X)
    assert_frame_equal(X_expected, X_t.to_dataframe())
    assert mock_imputer.call_count == 2
    assert mock_ohe.call_count == 2
def _rename_column_names_to_numeric(X, flatten_tuples=True):
    """Renames column names to numeric values when the input is a pd.DataFrame whose column names
    contain symbols ([, ], <) that LightGBM and XGBoost cannot natively handle. Used in the
    LightGBM and XGBoost estimator classes.

    Arguments:
        X (pd.DataFrame): The input training data of shape [n_samples, n_features]
        flatten_tuples (bool): Whether to flatten MultiIndex or tuple column names. LightGBM cannot handle columns with tuple names.

    Returns:
        Transformed X where column names are renamed to numerical values
    """
    if isinstance(X, (np.ndarray, list)):
        return pd.DataFrame(X)
    if isinstance(X, ww.DataTable):
        X_t = X.to_dataframe()
    else:
        X_t = X.copy()
    if flatten_tuples and (len(X_t.columns) > 0 and isinstance(X_t.columns, pd.MultiIndex)):
        flat_col_names = list(map(str, X_t.columns))
        X_t.columns = flat_col_names
        rename_cols_dict = dict((str(col), col_num) for col_num, col in enumerate(list(X.columns)))
    else:
        rename_cols_dict = dict((col, col_num) for col_num, col in enumerate(list(X.columns)))
    X_renamed = X_t.rename(columns=rename_cols_dict)
    if isinstance(X, ww.DataTable):
        X_renamed = ww.DataTable(X_renamed)
    return X_renamed
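# A minimal illustrative sketch (hypothetical test, not part of the original suite) of the helper
# above: column names containing symbols that LightGBM/XGBoost cannot handle are replaced with
# 0, 1, ..., and a Woodwork input comes back as a ww.DataTable.
def test_rename_column_names_to_numeric_sketch():
    X = ww.DataTable(pd.DataFrame({"a<b": [1, 2], "c>d": [3, 4]}))
    X_renamed = _rename_column_names_to_numeric(X)
    assert isinstance(X_renamed, ww.DataTable)
    assert list(X_renamed.to_dataframe().columns) == [0, 1]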
def test_make_pipeline_all_nan_no_categoricals(input_type, problem_type):
    # testing that all_null column is not considered categorical
    X = pd.DataFrame({"all_null": [np.nan, np.nan, np.nan, np.nan, np.nan],
                      "num": [1, 2, 3, 4, 5]})
    y = pd.Series([0, 0, 1, 1, 0])
    if input_type == 'ww':
        X = ww.DataTable(X)
        y = ww.DataColumn(y)
    estimators = get_estimators(problem_type=problem_type)
    pipeline_class = _get_pipeline_base_class(problem_type)
    if problem_type == ProblemTypes.MULTICLASS:
        y = pd.Series([0, 2, 1, 2])
    for estimator_class in estimators:
        if problem_type in estimator_class.supported_problem_types:
            pipeline = make_pipeline(X, y, estimator_class, problem_type)
            assert isinstance(pipeline, type(pipeline_class))
            assert pipeline.custom_hyperparameters is None
            if is_time_series(problem_type):
                delayed_features = [DelayedFeatureTransformer]
            else:
                delayed_features = []
            if estimator_class.model_family == ModelFamily.LINEAR_MODEL:
                estimator_components = [StandardScaler, estimator_class]
            elif estimator_class.model_family == ModelFamily.CATBOOST:
                estimator_components = [estimator_class]
            else:
                estimator_components = [estimator_class]
            assert pipeline.component_graph == [DropNullColumns, Imputer] + delayed_features + estimator_components
def test_partial_dependence_with_non_numeric_columns(data_type, linear_regression_pipeline_class,
                                                     logistic_regression_binary_pipeline_class):
    X = pd.DataFrame({'numeric': [1, 2, 3, 0],
                      'also numeric': [2, 3, 4, 1],
                      'string': ['a', 'b', 'a', 'c'],
                      'also string': ['c', 'b', 'a', 'd']})
    if data_type == "ww":
        X = ww.DataTable(X)
    y = [0, 0.2, 1.4, 1]
    pipeline = linear_regression_pipeline_class(parameters={"Linear Regressor": {"n_jobs": 1}})
    pipeline.fit(X, y)

    part_dep = partial_dependence(pipeline, X, features='numeric')
    assert list(part_dep.columns) == ["feature_values", "partial_dependence"]
    assert len(part_dep["partial_dependence"]) == 4
    assert len(part_dep["feature_values"]) == 4
    assert not part_dep.isnull().any(axis=None)

    part_dep = partial_dependence(pipeline, X, features='string')
    assert list(part_dep.columns) == ["feature_values", "partial_dependence"]
    assert len(part_dep["partial_dependence"]) == 3
    assert len(part_dep["feature_values"]) == 3
    assert not part_dep.isnull().any(axis=None)
def test_make_pipeline_text_columns(input_type, problem_type):
    X = pd.DataFrame({"numerical": [1, 2, 3, 1, 2],
                      "categorical": ["a", "b", "a", "c", "c"],
                      "text": ["string one", "another", "text for a column, this should be a text column!!",
                               "text string", "hello world"]})
    y = pd.Series([0, 0, 1, 1, 0])
    if input_type == 'ww':
        X = ww.DataTable(X)
        y = ww.DataColumn(y)
    estimators = get_estimators(problem_type=problem_type)
    pipeline_class = _get_pipeline_base_class(problem_type)
    if problem_type == ProblemTypes.MULTICLASS:
        y = pd.Series([0, 2, 1, 2])
    for estimator_class in estimators:
        if problem_type in estimator_class.supported_problem_types:
            pipeline = make_pipeline(X, y, estimator_class, problem_type)
            assert isinstance(pipeline, type(pipeline_class))
            assert pipeline.custom_hyperparameters is None
            if is_time_series(problem_type):
                delayed_features = [DelayedFeatureTransformer]
            else:
                delayed_features = []
            if estimator_class.model_family == ModelFamily.LINEAR_MODEL:
                estimator_components = [OneHotEncoder, StandardScaler, estimator_class]
            elif estimator_class.model_family == ModelFamily.CATBOOST:
                estimator_components = [estimator_class]
            else:
                estimator_components = [OneHotEncoder, estimator_class]
            assert pipeline.component_graph == [Imputer, TextFeaturizer] + delayed_features + estimator_components
def test_make_pipeline_datetime_no_categorical(input_type, problem_type):
    X = pd.DataFrame({"numerical": [1, 2, 3, 1, 2],
                      "some dates": pd.date_range('2000-02-03', periods=5, freq='W')})
    y = pd.Series([0, 1, 1, 0, 0])
    if input_type == 'ww':
        X = ww.DataTable(X)
        y = ww.DataColumn(y)
    estimators = get_estimators(problem_type=problem_type)
    pipeline_class = _get_pipeline_base_class(problem_type)
    if problem_type == ProblemTypes.MULTICLASS:
        y = pd.Series([0, 2, 1, 2])
    for estimator_class in estimators:
        if problem_type in estimator_class.supported_problem_types:
            pipeline = make_pipeline(X, y, estimator_class, problem_type)
            assert isinstance(pipeline, type(pipeline_class))
            assert pipeline.custom_hyperparameters is None
            if is_time_series(problem_type):
                delayed_features = [DelayedFeatureTransformer]
            else:
                delayed_features = []
            if estimator_class.model_family == ModelFamily.LINEAR_MODEL:
                estimator_components = [StandardScaler, estimator_class]
            elif estimator_class.model_family == ModelFamily.CATBOOST:
                estimator_components = [estimator_class]
            else:
                estimator_components = [estimator_class]
            assert pipeline.component_graph == [Imputer, DateTimeFeaturizer] + delayed_features + estimator_components
def eval_pipelines(pipelines, engine):
    # Note: `X`, `y`, and `automl_data` are assumed to be defined in the enclosing scope.
    futures = []
    for pipeline in pipelines:
        futures.append(engine.submit_evaluation_job(X=ww.DataTable(X), y=ww.DataColumn(y),
                                                    automl_config=automl_data, pipeline=pipeline))
    results = [f.get_result() for f in futures]
    return results
def test_make_pipeline_no_column_names(input_type, problem_type):
    X = pd.DataFrame([[1, "a", np.nan], [2, "b", np.nan], [5, "b", np.nan]])
    y = pd.Series([0, 0, 1])
    if input_type == 'ww':
        X = ww.DataTable(X)
        y = ww.DataColumn(y)
    estimators = get_estimators(problem_type=problem_type)
    pipeline_class = _get_pipeline_base_class(problem_type)
    if problem_type == ProblemTypes.MULTICLASS:
        y = pd.Series([0, 2, 1, 2])
    for estimator_class in estimators:
        if problem_type in estimator_class.supported_problem_types:
            pipeline = make_pipeline(X, y, estimator_class, problem_type)
            assert isinstance(pipeline, type(pipeline_class))
            assert pipeline.custom_hyperparameters is None
            if is_time_series(problem_type):
                delayed_features = [DelayedFeatureTransformer]
            else:
                delayed_features = []
            if estimator_class.model_family == ModelFamily.LINEAR_MODEL:
                estimator_components = [OneHotEncoder, StandardScaler, estimator_class]
            elif estimator_class.model_family == ModelFamily.CATBOOST:
                estimator_components = [estimator_class]
            else:
                estimator_components = [OneHotEncoder, estimator_class]
            assert pipeline.component_graph == [DropNullColumns, Imputer] + delayed_features + estimator_components
def test_read_csv_with_woodwork_params(sample_df_pandas, tmpdir):
    filepath = os.path.join(tmpdir, 'sample.csv')
    sample_df_pandas.to_csv(filepath, index=False)
    logical_types = {'full_name': 'NaturalLanguage',
                     'phone_number': 'PhoneNumber'}
    semantic_tags = {'age': ['tag1', 'tag2'],
                     'is_registered': ['tag3', 'tag4']}
    dt_from_csv = ww.read_csv(filepath=filepath,
                              index='id',
                              time_index='signup_date',
                              logical_types=logical_types,
                              semantic_tags=semantic_tags)
    dt = ww.DataTable(sample_df_pandas,
                      index='id',
                      time_index='signup_date',
                      logical_types=logical_types,
                      semantic_tags=semantic_tags)
    assert isinstance(dt, ww.DataTable)
    assert dt_from_csv.logical_types == dt.logical_types
    assert dt_from_csv.semantic_tags == dt.semantic_tags
    pd.testing.assert_frame_equal(dt_from_csv.to_dataframe(), dt.to_dataframe())
def test_new_dt_including(sample_df_pandas):
    # more thorough testing for this exists in indexer testing and new_dt_from_cols testing
    dt = ww.DataTable(sample_df_pandas)
    new_dt = _new_dt_including(dt, sample_df_pandas.iloc[:, 1:4])
    for col in new_dt.columns:
        assert new_dt.semantic_tags[col] == dt.semantic_tags[col]
        assert new_dt.logical_types[col] == dt.logical_types[col]
def test_class_imbalance_nonnumeric_balanced(input_type):
    X = pd.DataFrame()
    y_bools_balanced = pd.Series([True, True, True, False, False])
    y_binary_balanced = pd.Series(["No", "Yes", "No", "Yes"])
    y_multiclass_balanced = pd.Series(["red", "green", "red", "red", "blue",
                                       "green", "red", "blue", "green", "red"])
    if input_type == "ww":
        X = ww.DataTable(X)
        y_bools_balanced = ww.DataColumn(y_bools_balanced)
        y_binary_balanced = ww.DataColumn(y_binary_balanced)
        y_multiclass_balanced = ww.DataColumn(y_multiclass_balanced)

    class_imbalance_check = ClassImbalanceDataCheck(num_cv_folds=1)
    assert class_imbalance_check.validate(X, y_multiclass_balanced) == {"warnings": [], "errors": [], "actions": []}
    assert class_imbalance_check.validate(X, y_binary_balanced) == {"warnings": [], "errors": [], "actions": []}
    assert class_imbalance_check.validate(X, y_bools_balanced) == {"warnings": [], "errors": [], "actions": []}
def test_text_featurizer_woodwork_custom_overrides_returned_by_components(X_df):
    X_df = X_df.copy()
    X_df['text col'] = pd.Series(['this will be a natural language column because length', 'yay', 'hay'],
                                 dtype="string")
    y = pd.Series([1, 2, 1])
    override_types = [Integer, Double, Categorical, Boolean]
    tf = TextFeaturizer()
    for logical_type in override_types:
        try:
            X = ww.DataTable(X_df, logical_types={0: logical_type})
        except TypeError:
            continue
        tf.fit(X)
        transformed = tf.transform(X, y)
        assert isinstance(transformed, ww.DataTable)
        assert transformed.logical_types == {0: logical_type,
                                             'LSA(text col)[0]': Double,
                                             'LSA(text col)[1]': Double,
                                             'DIVERSITY_SCORE(text col)': Double,
                                             'MEAN_CHARACTERS_PER_WORD(text col)': Double,
                                             'POLARITY_SCORE(text col)': Double}
def test_delay_feature_transformer_supports_custom_index(encode_X_as_str, encode_y_as_str, use_woodwork,
                                                         delayed_features_data):
    X, y = delayed_features_data
    X, X_answer, y, y_answer = encode_X_y_as_strings(X, y, encode_X_as_str, encode_y_as_str)
    X.index = pd.RangeIndex(50, 81)
    X_answer.index = pd.RangeIndex(50, 81)
    y.index = pd.RangeIndex(50, 81)
    y_answer.index = pd.RangeIndex(50, 81)
    answer = pd.DataFrame({"feature": X.feature,
                           "feature_delay_1": X_answer.feature.shift(1),
                           "feature_delay_2": X_answer.feature.shift(2),
                           "feature_delay_3": X_answer.feature.shift(3),
                           "target_delay_0": y_answer,
                           "target_delay_1": y_answer.shift(1),
                           "target_delay_2": y_answer.shift(2),
                           "target_delay_3": y_answer.shift(3)}, index=pd.RangeIndex(50, 81))
    if use_woodwork:
        X = ww.DataTable(X)
        y = ww.DataColumn(y)
    pd.testing.assert_frame_equal(DelayedFeatureTransformer(max_delay=3, gap=7).fit_transform(X, y), answer)

    answer_only_y = pd.DataFrame({"target_delay_0": y_answer,
                                  "target_delay_1": y_answer.shift(1),
                                  "target_delay_2": y_answer.shift(2),
                                  "target_delay_3": y_answer.shift(3)}, index=pd.RangeIndex(50, 81))
    pd.testing.assert_frame_equal(DelayedFeatureTransformer(max_delay=3, gap=7).fit_transform(X=None, y=y), answer_only_y)
def test_datetime_featurizer_woodwork_custom_overrides_returned_by_components(with_datetime_col, encode_as_categories, X_df):
    override_types = [Integer, Double, Categorical, NaturalLanguage, Datetime]
    if with_datetime_col:
        X_df['datetime col'] = pd.to_datetime(['20200101', '20200519', '20190607'], format='%Y%m%d')
    for logical_type in override_types:
        try:
            X = ww.DataTable(X_df.copy(), logical_types={0: logical_type})
        except TypeError:
            continue
        datetime_transformer = DateTimeFeaturizer(encode_as_categories=encode_as_categories)
        datetime_transformer.fit(X)
        transformed = datetime_transformer.transform(X)

        assert isinstance(transformed, ww.DataTable)
        if with_datetime_col:
            if encode_as_categories:
                datetime_col_transformed = {'datetime col_year': Integer, 'datetime col_month': Categorical,
                                            'datetime col_day_of_week': Categorical, 'datetime col_hour': Integer}
            else:
                datetime_col_transformed = {'datetime col_year': Integer, 'datetime col_month': Integer,
                                            'datetime col_day_of_week': Integer, 'datetime col_hour': Integer}
            assert all(item in transformed.logical_types.items() for item in datetime_col_transformed.items())
        if logical_type == Datetime:
            if encode_as_categories:
                col_transformed = {'0_year': Integer, '0_month': Categorical,
                                   '0_day_of_week': Categorical, '0_hour': Integer}
            else:
                col_transformed = {'0_year': Integer, '0_month': Integer,
                                   '0_day_of_week': Integer, '0_hour': Integer}
            assert all(item in transformed.logical_types.items() for item in col_transformed.items())
        else:
            assert transformed.logical_types[0] == logical_type
def _retain_custom_types_and_initalize_woodwork(old_datatable, new_dataframe, ltypes_to_ignore=None):
    """Helper that takes an old Woodwork DataTable and a new pandas DataFrame and returns a new
    DataTable, retaining as many logical types from the old DataTable as can still be applied to
    columns present in the new DataFrame.

    Arguments:
        old_datatable (ww.DataTable): Woodwork DataTable to use
        new_dataframe (pd.DataFrame): Pandas data structure
        ltypes_to_ignore (list): List of Woodwork logical types to ignore. Columns from the old DataTable
            that have a logical type specified in this list will not have their logical types carried over
            to the new DataTable returned.

    Returns:
        A new DataTable where columns that exist in both the old DataTable and the new DataFrame retain
        the original logical type, if possible and not specified to be ignored.
    """
    retained_logical_types = {}
    if ltypes_to_ignore is None:
        ltypes_to_ignore = []
    col_intersection = set(old_datatable.columns).intersection(set(new_dataframe.columns))
    logical_types = old_datatable.logical_types
    for col in col_intersection:
        if logical_types[col] in ltypes_to_ignore:
            continue
        if str(new_dataframe[col].dtype) != logical_types[col].pandas_dtype:
            try:
                new_dataframe[col].astype(logical_types[col].pandas_dtype)
                retained_logical_types[col] = old_datatable[col].logical_type
            except (ValueError, TypeError):
                pass
    return ww.DataTable(new_dataframe, logical_types=retained_logical_types)
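# A minimal illustrative sketch (hypothetical test, not part of the original suite) of the helper
# above: a Categorical override on the old DataTable is carried over when the column still exists
# in the new DataFrame and its values can be cast to the matching pandas dtype.
def test_retain_custom_types_sketch():
    old_dt = ww.DataTable(pd.DataFrame({"a": [1, 2, 3]}), logical_types={"a": "categorical"})
    new_df = pd.DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3]})
    new_dt = _retain_custom_types_and_initalize_woodwork(old_dt, new_df)
    assert new_dt.logical_types["a"] == old_dt.logical_types["a"]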
def test_explain_predictions_best_worst_custom_metric(mock_make_table, output_format, answer):
    mock_make_table.return_value = "table goes here" if output_format == "text" else {"explanations": ["explanation_dictionary_goes_here"]}
    pipeline = MagicMock()
    pipeline.parameters = "Parameters go here"
    input_features = pd.DataFrame({"a": [5, 6]})
    pipeline.problem_type = ProblemTypes.REGRESSION
    pipeline.name = "Test Pipeline Name"
    pipeline.compute_estimator_features.return_value = ww.DataTable(input_features)
    pipeline.predict.return_value = ww.DataColumn(pd.Series([2, 1]))
    y_true = pd.Series([3, 2])

    def sum(y_true, y_pred):
        return y_pred + y_true

    best_worst_report = explain_predictions_best_worst(pipeline, input_features, y_true=y_true,
                                                       num_to_explain=1, metric=sum,
                                                       output_format=output_format)
    if output_format == "text":
        compare_two_tables(best_worst_report.splitlines(), regression_custom_metric_answer.splitlines())
    else:
        assert best_worst_report == answer
def test_datetime_nan_check_input_formats():
    dt_nan_check = DateTimeNaNDataCheck()

    # test empty pd.DataFrame
    assert dt_nan_check.validate(pd.DataFrame()) == {"warnings": [], "errors": [], "actions": []}

    expected = {
        "warnings": [],
        "actions": [],
        "errors": [DataCheckError(message='Input datetime column(s) (index) contains NaN values. Please impute NaN values or drop these rows or columns.',
                                  data_check_name=DateTimeNaNDataCheck.name,
                                  message_code=DataCheckMessageCode.DATETIME_HAS_NAN,
                                  details={"columns": 'index'}).to_dict()]
    }

    dates = np.arange(np.datetime64('2017-01-01'), np.datetime64('2017-01-08'))
    dates[0] = np.datetime64('NaT')

    # test Woodwork
    ww_input = ww.DataTable(pd.DataFrame(dates, columns=['index']))
    assert dt_nan_check.validate(ww_input) == expected

    expected = {
        "warnings": [],
        "actions": [],
        "errors": [DataCheckError(message='Input datetime column(s) (0) contains NaN values. Please impute NaN values or drop these rows or columns.',
                                  data_check_name=DateTimeNaNDataCheck.name,
                                  message_code=DataCheckMessageCode.DATETIME_HAS_NAN,
                                  details={'columns': '0'}).to_dict()]
    }

    # test 2D list
    assert dt_nan_check.validate([dates,
                                  np.arange(np.datetime64('2017-01-01'), np.datetime64('2017-01-08'))]) == expected

    # test np.array
    assert dt_nan_check.validate(np.array([dates,
                                           np.arange(np.datetime64('2017-01-01'), np.datetime64('2017-01-08'))])) == expected
def test_default_data_checks_regression(input_type):
    # `messages` below is the module-level list of expected data check messages (defined elsewhere).
    X = pd.DataFrame({'lots_of_null': [None, None, None, None, "some data"],
                      'all_null': [None, None, None, None, None],
                      'also_all_null': [None, None, None, None, None],
                      'no_null': [1, 2, 3, 5, 5],
                      'id': [0, 1, 2, 3, 4],
                      'has_label_leakage': [100, 200, 100, 200, 100]})
    y = pd.Series([0.3, 100.0, np.nan, 1.0, 0.2])
    y_no_variance = pd.Series([5] * 5)

    if input_type == "ww":
        X = ww.DataTable(X)
        y = ww.DataColumn(y)
        y_no_variance = ww.DataColumn(y_no_variance)

    null_leakage = [DataCheckWarning(message="Column 'lots_of_null' is 95.0% or more correlated with the target",
                                     data_check_name="TargetLeakageDataCheck",
                                     message_code=DataCheckMessageCode.TARGET_LEAKAGE,
                                     details={"column": "lots_of_null"}).to_dict()]
    data_checks = DefaultDataChecks("regression", get_default_primary_search_objective("regression"))
    assert data_checks.validate(X, y) == {"warnings": messages[:3], "errors": messages[3:]}

    # Skip Invalid Target
    assert data_checks.validate(X, y_no_variance) == {
        "warnings": messages[:3] + null_leakage,
        "errors": messages[4:] + [DataCheckError(message="Y has 1 unique value.",
                                                 data_check_name="NoVarianceDataCheck",
                                                 message_code=DataCheckMessageCode.NO_VARIANCE,
                                                 details={"column": "Y"}).to_dict()]
    }

    data_checks = DataChecks(DefaultDataChecks._DEFAULT_DATA_CHECK_CLASSES,
                             {"InvalidTargetDataCheck": {"problem_type": "regression",
                                                         "objective": get_default_primary_search_objective("regression")}})
    assert data_checks.validate(X, y) == {"warnings": messages[:3], "errors": messages[3:]}
def test_default_data_checks_regression(input_type):
    X = pd.DataFrame({'lots_of_null': [None, None, None, None, "some data"],
                      'all_null': [None, None, None, None, None],
                      'also_all_null': [None, None, None, None, None],
                      'no_null': [1, 2, 3, 5, 5],
                      'id': [0, 1, 2, 3, 4],
                      'has_label_leakage': [100, 200, 100, 200, 100],
                      'natural_language_nan': [None,
                                               "string_that_is_long_enough_for_natural_language_1",
                                               "string_that_is_long_enough_for_natural_language_2",
                                               "string_that_is_long_enough_for_natural_language_3",
                                               "string_that_is_long_enough_for_natural_language_4"],
                      'nan_dt_col': pd.Series(pd.date_range('20200101', periods=5))})
    X['nan_dt_col'][0] = None
    y = pd.Series([0.3, 100.0, np.nan, 1.0, 0.2])
    y_no_variance = pd.Series([5] * 5)

    if input_type == "ww":
        X = ww.DataTable(X)
        y = ww.DataColumn(y)
        y_no_variance = ww.DataColumn(y_no_variance)

    null_leakage = [DataCheckWarning(message="Column 'lots_of_null' is 95.0% or more correlated with the target",
                                     data_check_name="TargetLeakageDataCheck",
                                     message_code=DataCheckMessageCode.TARGET_LEAKAGE,
                                     details={"column": "lots_of_null"}).to_dict()]
    data_checks = DefaultDataChecks("regression", get_default_primary_search_objective("regression"))
    id_leakage_warning = [DataCheckWarning(message="Column 'id' is 95.0% or more correlated with the target",
                                           data_check_name="TargetLeakageDataCheck",
                                           message_code=DataCheckMessageCode.TARGET_LEAKAGE,
                                           details={"column": "id"}).to_dict()]
    nan_dt_leakage_warning = [DataCheckWarning(message="Column 'nan_dt_col' is 95.0% or more correlated with the target",
                                               data_check_name="TargetLeakageDataCheck",
                                               message_code=DataCheckMessageCode.TARGET_LEAKAGE,
                                               details={"column": "nan_dt_col"}).to_dict()]
    impute_action = DataCheckAction(DataCheckActionCode.IMPUTE_COL,
                                    metadata={"column": None, "is_target": True, 'impute_strategy': 'mean'}).to_dict()
    nan_dt_action = DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": 'nan_dt_col'}).to_dict()
    expected_actions_with_drop_and_impute = expected_actions[:3] + [nan_dt_action, impute_action] + expected_actions[4:]
    assert data_checks.validate(X, y) == {"warnings": messages[:3] + id_leakage_warning + nan_dt_leakage_warning,
                                          "errors": messages[3:],
                                          "actions": expected_actions_with_drop_and_impute}

    # Skip Invalid Target
    assert data_checks.validate(X, y_no_variance) == {
        "warnings": messages[:3] + null_leakage,
        "errors": messages[4:7] + [DataCheckError(message="Y has 1 unique value.",
                                                  data_check_name="NoVarianceDataCheck",
                                                  message_code=DataCheckMessageCode.NO_VARIANCE,
                                                  details={"column": "Y"}).to_dict()] + messages[7:],
        "actions": expected_actions[:3] + expected_actions[4:]
    }

    data_checks = DataChecks(DefaultDataChecks._DEFAULT_DATA_CHECK_CLASSES,
                             {"InvalidTargetDataCheck": {"problem_type": "regression",
                                                         "objective": get_default_primary_search_objective("regression")}})
    assert data_checks.validate(X, y) == {"warnings": messages[:3] + id_leakage_warning + nan_dt_leakage_warning,
                                          "errors": messages[3:],
                                          "actions": expected_actions_with_drop_and_impute}
def test_compute_final_component_features_single_component(mock_transform, X_y_binary):
    X, y = X_y_binary
    X = pd.DataFrame(X)
    mock_transform.return_value = ww.DataTable(X)
    component_graph = ComponentGraph({'Dummy Component': [DummyTransformer]}).instantiate({})
    component_graph.fit(X, y)

    X_t = component_graph.compute_final_component_features(X)
    assert_frame_equal(X, X_t.to_dataframe())