예제 #1
0
def test_outliers_data_check_init():
    """Check that OutliersDataCheck seeds its random state from ``random_state``.

    ``RandomState.get_state()[0]`` is only the generator name, which is the
    same for every seed, so the key array and the position inside it are
    compared as well; otherwise the assertions could not detect a wrong seed.
    """

    def assert_seeded_like(check, seed):
        # Compare the check's generator state with a freshly built one.
        actual = check.random_state.get_state()
        expected = get_random_state(seed).get_state()
        assert actual[0] == expected[0]
        # Index 1 is the key array and index 2 the position within it;
        # together they identify the seed-dependent part of the state.
        assert (actual[1] == expected[1]).all()
        assert actual[2] == expected[2]

    # Default construction is expected to behave like random_state=0.
    assert_seeded_like(OutliersDataCheck(), 0)
    assert_seeded_like(OutliersDataCheck(random_state=2), 2)
예제 #2
0
 def predict(self, X):
     """Produce one baseline prediction per row of X.

     The "strategy" parameter selects the behavior:
     "mode" repeats the stored mode value, "random" draws from the stored
     classes, and any other value draws from the stored classes using the
     stored per-class frequencies as probabilities.

     Arguments:
         X: Input features; only the number of rows is used here.

     Returns:
         The predictions after being passed through infer_feature_types.
     """
     X = infer_feature_types(X)
     strategy = self.parameters["strategy"]
     if strategy == "mode":
         return infer_feature_types(pd.Series([self._mode] * len(X)))

     # Both random strategies draw from a generator built from the stored seed.
     rng = get_random_state(self.random_seed)
     if strategy == "random":
         return infer_feature_types(rng.choice(self._classes, len(X)))
     return infer_feature_types(
         rng.choice(self._classes, len(X), p=self._percentage_freq))
예제 #3
0
    def __init__(self, random_state=0):
        """Set up the outlier data check and its source of randomness.

        Arguments:
            random_state (int, np.random.RandomState): Seed or random state; it is passed
                through get_random_state and the result is stored on the instance. Defaults to 0.
        """
        rng = get_random_state(random_state)
        self.random_state = rng
예제 #4
0
def test_baseline_binary_random_weighted(X_y_binary):
    """Check BaselineClassifier's "random_weighted" strategy on binary data.

    Predictions must match draws from ``get_random_state(0)`` weighted by the
    class frequencies of y, every predict_proba row must equal those
    frequencies, and the feature importance must be all zeros.
    """
    X, y = X_y_binary
    values, counts = np.unique(y, return_counts=True)
    percent_freq = counts.astype(float) / len(y)
    # Float division is not guaranteed to sum to exactly 1.0, so compare
    # with a tolerance instead of ``==``.
    assert np.isclose(percent_freq.sum(), 1.0)

    clf = BaselineClassifier(strategy="random_weighted", random_state=0)
    clf.fit(X, y)

    assert clf.classes_ == [0, 1]
    expected_predictions = pd.Series(
        get_random_state(0).choice(values, len(X), p=percent_freq),
        dtype="Int64")
    predictions = clf.predict(X)
    assert_series_equal(expected_predictions, predictions.to_series())

    predicted_proba = clf.predict_proba(X)
    assert predicted_proba.shape == (len(X), 2)
    # One identical row of class frequencies per sample.
    expected_predictions_proba = pd.DataFrame(
        np.array([list(percent_freq)] * len(X)))
    assert_frame_equal(expected_predictions_proba,
                       predicted_proba.to_dataframe())

    np.testing.assert_allclose(clf.feature_importance,
                               np.array([0.0] * X.shape[1]))
def test_more_top_n_unique_values_large():
    """Check which columns OneHotEncoder(top_n=3) emits when a column has more than top_n values.

    The expected "col_1" categories are reproduced here by shuffling the value
    counts with an identically seeded random state and taking the first top_n
    entries after a stable descending sort.
    """
    X = pd.DataFrame({"col_1": ["a", "b", "c", "d", "e", "f", "g", "h", "i"],
                      "col_2": ["a", "a", "a", "b", "b", "c", "c", "d", "e"],
                      "col_3": ["a", "a", "a", "b", "b", "b", "c", "c", "d"],
                      "col_4": [2, 0, 1, 3, 0, 1, 2, 4, 1]})

    random_seed = 2
    test_random_state = get_random_state(random_seed)

    encoder = OneHotEncoder(top_n=3, random_state=random_seed)
    encoder.fit(X)
    X_t = encoder.transform(X)

    # Conversion changes the resulting dataframe dtype, resulting in a different random state, so we need to make the conversion here too
    X = _convert_woodwork_types_wrapper(
        _convert_to_woodwork_structure(X).to_dataframe())
    col_1_counts = (
        X["col_1"].value_counts(dropna=False).to_frame()
        .sample(frac=1, random_state=test_random_state)
        .sort_values(["col_1"], ascending=False, kind='mergesort')
    )
    top_col_1_values = col_1_counts.head(encoder.parameters['top_n']).index.tolist()

    expected_col_names = {"col_2_a", "col_2_b", "col_2_c",
                          "col_3_a", "col_3_b", "col_3_c", "col_4"}
    expected_col_names |= {"col_1_" + val for val in top_col_1_values}

    assert set(X_t.columns) == expected_col_names
예제 #6
0
def test_xgboost_feature_name_with_random_ascii(problem_type, X_y_binary,
                                                X_y_multi):
    """Fit XGBoostClassifier on columns named after every printable character.

    The feature matrix is replaced by random values with one column per
    character in string.printable; predictions, probabilities and feature
    importances must have the expected sizes and must not be entirely NaN.
    """
    clf = XGBoostClassifier()
    if problem_type == ProblemTypes.BINARY:
        (X, y), expected_cols = X_y_binary, 2
    elif problem_type == ProblemTypes.MULTICLASS:
        (X, y), expected_cols = X_y_multi, 3

    rng = get_random_state(clf.random_state)
    random_values = rng.random((X.shape[0], len(string.printable)))
    col_names = list(map('column_{}'.format, string.printable))
    X = pd.DataFrame(random_values, columns=col_names)

    clf.fit(X, y)

    labels = clf.predict(X)
    assert len(labels) == len(y)
    assert not np.isnan(labels.to_series()).all()

    probabilities = clf.predict_proba(X)
    assert probabilities.shape == (len(y), expected_cols)
    assert not np.isnan(probabilities.to_dataframe()).all().all()

    importance = clf.feature_importance
    assert len(importance) == len(X.columns)
    assert not np.isnan(clf.feature_importance).all().all()
예제 #7
0
def test_estimators_feature_name_with_random_ascii(X_y_binary, X_y_multi,
                                                   X_y_regression,
                                                   helper_functions):
    """Fit every search estimator on columns named after each printable character.

    For each estimator and each of its supported problem types, random data
    with one column per character in string.printable is fitted; feature
    importances and predictions must have matching sizes and must not be
    entirely NaN.
    """
    # The column names do not depend on the estimator, so build them once.
    col_names = list(map('column_{}'.format, string.printable))

    for estimator_class in _all_estimators_used_in_search():
        supported_problem_types = list(
            map(handle_problem_types, estimator_class.supported_problem_types))
        for problem_type in supported_problem_types:
            clf = helper_functions.safe_init_component_with_njobs_1(
                estimator_class)
            if problem_type == ProblemTypes.BINARY:
                X, y = X_y_binary
            elif problem_type == ProblemTypes.MULTICLASS:
                X, y = X_y_multi
            elif problem_type == ProblemTypes.REGRESSION:
                X, y = X_y_regression

            rng = get_random_state(clf.random_state)
            random_values = rng.random((X.shape[0], len(string.printable)))
            X = pd.DataFrame(random_values, columns=col_names)

            clf.fit(X, y)
            assert len(clf.feature_importance) == len(X.columns)
            assert not np.isnan(clf.feature_importance).all().all()

            predictions = clf.predict(X).to_series()
            assert len(predictions) == len(y)
            assert not np.isnan(predictions).all()
예제 #8
0
    def __init__(self,
                 pipeline_hyperparameter_ranges,
                 random_seed=0,
                 with_replacement=False,
                 replacement_max_attempts=10):
        """Set up the search space, the random state and the duplicate tracking.

        Arguments:
            pipeline_hyperparameter_ranges (dict): a set of hyperparameter ranges corresponding to a pipeline's parameters
            random_seed (int): Seed for random number generator. Defaults to 0.
            with_replacement (bool): If false, only unique hyperparameters will be shown
            replacement_max_attempts (int): The maximum number of tries to get a unique
                set of random parameters. NOTE(review): previously documented as only
                used when with_replacement=True; this constructor just stores the
                value, so confirm against the code that reads it.
        """
        super().__init__(pipeline_hyperparameter_ranges,
                         random_seed=random_seed)
        self._with_replacement = with_replacement
        self._replacement_max_attempts = replacement_max_attempts
        self._space = Space(self._search_space_ranges)
        self._random_state = get_random_state(random_seed)
        # The set of already-used parameters starts out holding the empty tuple.
        self._used_parameters = {()}
        self.curr_params = None
예제 #9
0
    def __init__(self, parameters, random_state=0):
        """Build the pipeline's component graph and instantiate it with the given parameters.

        Required Class Variables:
            component_graph (list): List of components in order. Accepts strings or ComponentBase subclasses in the list

        Arguments:
            parameters (dict): Dictionary with component names as keys and dictionary of that component's parameters as values.
                 An empty dictionary {} implies using all default values for component parameters.
            random_state (int, np.random.RandomState): The random seed/state. Defaults to 0.
        """
        self.random_state = get_random_state(random_state)

        if isinstance(self.component_graph, list):  # Backwards compatibility
            graph = ComponentGraph().from_list(self.component_graph, random_state=self.random_state)
        else:
            graph = ComponentGraph(component_dict=self.component_graph, random_state=self.random_state)
        self._component_graph = graph
        self._component_graph.instantiate(parameters)

        self.input_feature_names = {}
        self.input_target_name = None

        # Only a final component that is an Estimator is exposed as the
        # pipeline's estimator; otherwise both attributes stay None.
        final_component = self._component_graph.get_last_component()
        if isinstance(final_component, Estimator):
            self.estimator = final_component
        else:
            self.estimator = None
        if self.estimator is not None:
            self._estimator_name = self._component_graph.compute_order[-1]
        else:
            self._estimator_name = None

        self._validate_estimator_problem_type()
        self._is_fitted = False
        self._pipeline_params = parameters.get("pipeline", {})
예제 #10
0
def test_estimators_feature_name_with_random_ascii(X_y_binary, X_y_multi,
                                                   X_y_regression, ts_data,
                                                   helper_functions):
    """Fit every search estimator (except ARIMARegressor) on printable-character column names.

    Besides the size and not-all-NaN checks on feature importances and
    predictions, the estimator must report no input feature names before
    fitting and exactly the generated column names afterwards.
    """
    # The column names do not depend on the estimator, so build them once.
    col_names = list(map('column_{}'.format, string.printable))

    for estimator_class in _all_estimators_used_in_search():
        if estimator_class.__name__ == 'ARIMARegressor':
            continue
        supported_problem_types = list(
            map(handle_problem_types, estimator_class.supported_problem_types))
        for problem_type in supported_problem_types:
            clf = helper_functions.safe_init_component_with_njobs_1(
                estimator_class)
            if is_binary(problem_type):
                X, y = X_y_binary
            elif is_multiclass(problem_type):
                X, y = X_y_multi
            elif is_regression(problem_type):
                X, y = X_y_regression

            rng = get_random_state(clf.random_seed)
            random_values = rng.random((X.shape[0], len(string.printable)))
            X = pd.DataFrame(random_values, columns=col_names)

            assert clf.input_feature_names is None
            clf.fit(X, y)
            assert len(clf.feature_importance) == len(X.columns)
            assert not np.isnan(clf.feature_importance).all().all()

            predictions = clf.predict(X).to_series()
            assert len(predictions) == len(y)
            assert not np.isnan(predictions).all()
            assert clf.input_feature_names == col_names
예제 #11
0
    def __init__(self,
                 allowed_pipelines=None,
                 max_iterations=None,
                 tuner_class=None,
                 random_state=0):
        """Base setup for an automated machine learning (AutoML) algorithm.

        The algorithm decides which pipelines to evaluate next and which parameters to configure them with.
        Subclasses define a next_batch method returning the next group of pipelines to evaluate on the training data;
        overriding add_result is a convenient way to record pipeline evaluation info if necessary.

        Arguments:
            allowed_pipelines (list(class)): A list of PipelineBase subclasses indicating the pipelines allowed in the search. When None, an empty list is stored.
            max_iterations (int): The maximum number of iterations to be evaluated.
            tuner_class (class): A subclass of Tuner, to be used to find parameters for each pipeline. The default of None indicates the SKOptTuner will be used.
            random_state (int, np.random.RandomState): The random seed/state. Defaults to 0.
        """
        self.random_state = get_random_state(random_state)
        self.allowed_pipelines = allowed_pipelines if allowed_pipelines else []
        self.max_iterations = max_iterations
        self._tuner_class = tuner_class if tuner_class else SKOptTuner
        # One tuner per allowed pipeline, keyed by pipeline name; every tuner
        # receives the same random state object.
        self._tuners = {
            pipeline.name: self._tuner_class(pipeline.hyperparameters,
                                             random_state=self.random_state)
            for pipeline in self.allowed_pipelines
        }
        self._pipeline_number = 0
        self._batch_number = 0
예제 #12
0
def test_ohe_top_n_categories_always_the_same():
    """Check that fitting the same OneHotEncoder twice yields identical output.

    The check runs once with an integer seed and once with a random state
    object built from the same seed.
    """
    category_counts = (("cat_1", 5), ("cat_2", 4), ("cat_3", 3),
                       ("cat_4", 3), ("cat_5", 3))
    categories = []
    for label, count in category_counts:
        categories.extend([label] * count)
    df = pd.DataFrame({"categories": categories,
                       "numbers": range(18)})

    def check_df_equality(random_state):
        # Two consecutive fit_transform calls on one encoder must agree.
        ohe = OneHotEncoder(top_n=4, random_state=random_state)
        first = ohe.fit_transform(df)
        second = ohe.fit_transform(df)
        pd.testing.assert_frame_equal(first, second)

    check_df_equality(5)
    check_df_equality(get_random_state(5))
예제 #13
0
def test_baseline_multiclass_random(X_y_multi):
    """Check BaselineClassifier's "random" strategy on three-class data.

    Predictions must match draws from ``get_random_state(0)``, every
    predict_proba row must be uniform over the classes, and the feature
    importance must be all zeros.
    """
    X, y = X_y_multi
    values = np.unique(y)

    clf = BaselineClassifier(strategy="random", random_seed=0)
    clf.fit(X, y)
    assert clf.classes_ == [0, 1, 2]

    random_draws = get_random_state(0).choice(np.unique(y), len(X))
    expected_predictions = pd.Series(random_draws, dtype="Int64")
    predictions = clf.predict(X)
    assert_series_equal(expected_predictions, predictions.to_series())

    predicted_proba = clf.predict_proba(X)
    assert predicted_proba.shape == (len(X), 3)
    uniform_rows = [[1. / 3] * len(values)] * len(X)
    assert_frame_equal(pd.DataFrame(np.array(uniform_rows)),
                       predicted_proba.to_dataframe())

    zero_importance = np.array([0.0] * X.shape[1])
    np.testing.assert_allclose(clf.feature_importance, zero_importance)
예제 #14
0
def test_xgboost_feature_name_with_random_ascii(X_y_regression):
    """Fit XGBoostRegressor on columns named after every printable character.

    The feature matrix is replaced by random values with one column per
    character in string.printable; predictions and feature importances must
    have the expected sizes and must not be entirely NaN.
    """
    X, y = X_y_regression
    clf = XGBoostRegressor()

    rng = get_random_state(clf.random_state)
    random_values = rng.random((X.shape[0], len(string.printable)))
    col_names = list(map('column_{}'.format, string.printable))
    X = pd.DataFrame(random_values, columns=col_names)

    clf.fit(X, y)

    predicted = clf.predict(X)
    assert len(predicted) == len(y)
    assert not np.isnan(predicted.to_series()).all()

    assert len(clf.feature_importance) == len(X.columns)
    assert not np.isnan(clf.feature_importance).all().all()
def test_baseline_binary_random(X_y_binary):
    """Check BaselineClassifier's "random" strategy on binary data.

    Predictions must match draws from ``get_random_state(0)``, every
    predict_proba row must be [0.5, 0.5], and the feature importance must be
    all zeros.
    """
    X, y = X_y_binary
    values = np.unique(y)

    clf = BaselineClassifier(strategy="random", random_state=0)
    clf.fit(X, y)
    assert clf.classes_ == [0, 1]

    predictions = clf.predict(X)
    expected_predictions = get_random_state(0).choice(np.unique(y), len(X))
    np.testing.assert_allclose(predictions, expected_predictions)

    predicted_proba = clf.predict_proba(X)
    assert predicted_proba.shape == (len(X), 2)
    even_rows = np.array([[0.5] * len(values)] * len(X))
    np.testing.assert_allclose(predicted_proba, even_rows)

    zero_importance = np.array([0.0] * X.shape[1])
    np.testing.assert_allclose(clf.feature_importance, zero_importance)
예제 #16
0
def test_baseline_multi_random(X_y_multi):
    """Check BaselineMulticlassPipeline with the "random" baseline strategy.

    Predictions must match draws from ``get_random_state(0)``, every
    predict_proba row must be uniform over the classes, and the second column
    of the feature importance table must be all zeros.
    """
    X, y = X_y_multi
    values = np.unique(y)

    clf = BaselineMulticlassPipeline(
        parameters={"Baseline Classifier": {"strategy": "random"}})
    clf.fit(X, y)
    predicted_proba = clf.predict_proba(X)

    predictions = clf.predict(X)
    expected_predictions = get_random_state(0).choice(np.unique(y), len(X))
    np.testing.assert_allclose(predictions, expected_predictions)

    assert predicted_proba.shape == (len(X), 3)
    uniform_rows = np.array([[1. / 3] * len(values)] * len(X))
    np.testing.assert_allclose(predicted_proba, uniform_rows)

    zero_importance = np.array([0.0] * X.shape[1])
    np.testing.assert_allclose(clf.feature_importance.iloc[:, 1],
                               zero_importance)
def test_baseline_binary_random_weighted(X_y_binary):
    """Check BaselineClassifier's "random_weighted" strategy on binary data.

    Predictions must match draws from ``get_random_state(0)`` weighted by the
    class frequencies of y, every predict_proba row must equal those
    frequencies, and the feature importance must be all zeros.
    """
    X, y = X_y_binary
    values, counts = np.unique(y, return_counts=True)
    percent_freq = counts.astype(float) / len(y)
    # Float division is not guaranteed to sum to exactly 1.0, so compare
    # with a tolerance instead of ``==``.
    assert np.isclose(percent_freq.sum(), 1.0)
    clf = BaselineClassifier(strategy="random_weighted", random_state=0)
    clf.fit(X, y)
    assert clf.classes_ == [0, 1]
    np.testing.assert_allclose(
        clf.predict(X),
        get_random_state(0).choice(values, len(X), p=percent_freq))
    predicted_proba = clf.predict_proba(X)
    assert predicted_proba.shape == (len(X), 2)
    # One identical row of class frequencies per sample.
    np.testing.assert_allclose(
        predicted_proba,
        np.array([list(percent_freq)] * len(X)))
    np.testing.assert_allclose(clf.feature_importance,
                               np.array([0.0] * X.shape[1]))
예제 #18
0
    def __init__(self, component_dict=None, random_state=0):
        """ Initializes a component graph for a pipeline as a directed acyclic graph (DAG).

        Arguments:
            component_dict (dict): Maps each component name to a list whose first
                entry identifies the component class. None is treated as an empty dictionary.
            random_state: Passed through get_random_state and stored on the instance. Defaults to 0.

        Raises:
            ValueError: If any value in component_dict is not a list.

        Example:
            >>> component_dict = {'imputer': ['Imputer'], 'ohe': ['One Hot Encoder', 'imputer.x'], 'estimator_1': ['Random Forest Classifier', 'ohe.x'], 'estimator_2': ['Decision Tree Classifier', 'ohe.x'], 'final': ['Logistic Regression Classifier', 'estimator_1', 'estimator_2']}
            >>> component_graph = ComponentGraph(component_dict)
        """
        self.random_state = get_random_state(random_state)
        self.component_dict = component_dict or {}
        self.component_instances = {}
        self._is_instantiated = False
        for name, info in self.component_dict.items():
            if isinstance(info, list):
                # Only the resolved class is stored at this point.
                self.component_instances[name] = handle_component_class(info[0])
                continue
            raise ValueError(
                'All component information should be passed in as a list')
        self.compute_order = self.generate_order(self.component_dict)
        self.input_feature_names = {}
예제 #19
0
def test_baseline_multi_random_weighted(X_y_multi):
    """Check BaselineMulticlassPipeline with the "random_weighted" baseline strategy.

    Predictions must match draws from ``get_random_state(0)`` weighted by the
    class frequencies of y, every predict_proba row must equal those
    frequencies, and the second column of the feature importance table must
    be all zeros.
    """
    X, y = X_y_multi
    values, counts = np.unique(y, return_counts=True)
    percent_freq = counts.astype(float) / len(y)
    # Float division is not guaranteed to sum to exactly 1.0, so compare
    # with a tolerance instead of ``==``.
    assert np.isclose(percent_freq.sum(), 1.0)

    parameters = {"Baseline Classifier": {"strategy": "random_weighted"}}
    clf = BaselineMulticlassPipeline(parameters=parameters)
    clf.fit(X, y)
    predicted_proba = clf.predict_proba(X)

    np.testing.assert_allclose(
        clf.predict(X),
        get_random_state(0).choice(values, len(X), p=percent_freq))
    assert predicted_proba.shape == (len(X), 3)
    # One identical row of class frequencies per sample.
    np.testing.assert_allclose(
        predicted_proba,
        np.array([list(percent_freq)] * len(X)))
    np.testing.assert_allclose(clf.feature_importance.iloc[:, 1],
                               np.array([0.0] * X.shape[1]))
def test_baseline_multi_random(X_y_multi):
    """Check BaselineMulticlassPipeline with the "random" baseline strategy.

    Predictions must match draws from ``get_random_state(0)`` as an Int64
    series, every predict_proba row must be uniform over the classes, and the
    second column of the feature importance table must be all zeros.
    """
    X, y = X_y_multi
    values = np.unique(y)

    clf = BaselineMulticlassPipeline(
        parameters={"Baseline Classifier": {"strategy": "random"}})
    clf.fit(X, y)

    random_draws = get_random_state(0).choice(np.unique(y), len(X))
    expected_predictions = pd.Series(random_draws, dtype="Int64")
    assert_series_equal(expected_predictions, clf.predict(X).to_series())

    predicted_proba = clf.predict_proba(X)
    assert predicted_proba.shape == (len(X), 3)
    uniform_rows = [[1. / 3] * len(values)] * len(X)
    expected_predictions_proba = pd.DataFrame(np.array(uniform_rows))
    assert_frame_equal(expected_predictions_proba,
                       predicted_proba.to_dataframe())

    zero_importance = np.array([0.0] * X.shape[1])
    np.testing.assert_allclose(clf.feature_importance.iloc[:, 1],
                               zero_importance)
예제 #21
0
 def __init__(self, parameters=None, component_obj=None, random_state=0, **kwargs):
     """Store the component's parameters, wrapped object and random state.

     Arguments:
         parameters (dict): Component parameters; a falsy value is replaced by an empty dictionary.
         component_obj: Object stored on the instance as the wrapped component.
         random_state: Passed through get_random_state and stored on the instance. Defaults to 0.
         **kwargs: Accepted but not used by this constructor.
     """
     self.random_state = get_random_state(random_state)
     self._is_fitted = False
     self._parameters = parameters if parameters else {}
     self._component_obj = component_obj