Пример #1
0
    def log_training_data(self,
                          train_features,
                          train_targets,
                          overwrite=False):
        """
        Associate training data with this model reference.

        The features and targets are joined into a single DataFrame,
        histograms are computed from it, and the result is sent with a
        ``PUT`` request to ``self._histogram_endpoint``.

        .. versionchanged:: 0.14.4
           Instead of uploading the data itself as a CSV artifact ``'train_data'``, this method now
           generates a histogram for internal use by our deployment data monitoring system.

        .. deprecated:: 0.18.0
            This method is no longer supported. Please see our documentation
            for information about our platform's data monitoring features.

        Parameters
        ----------
        train_features : pd.DataFrame
            pandas DataFrame representing features of the training data.
        train_targets : pd.DataFrame or pd.Series
            pandas DataFrame or Series representing targets of the training
            data. A Series is converted to a single-column DataFrame.
        overwrite : bool, default False
            Whether to allow overwriting existing training data. The value is
            not read by this implementation.

        Raises
        ------
        TypeError
            If `train_features` is not a DataFrame, or `train_targets` is
            neither a DataFrame nor a Series.
        ValueError
            If `train_features` and `train_targets` share any column name.

        Warns
        -----
        FutureWarning
            Always emitted, because this method is deprecated.

        """
        # stacklevel=2 attributes the warning to the caller's line instead of
        # this one, so users can locate the deprecated call in their own code.
        warnings.warn(
            "This method is no longer supported. Please see our documentation"
            " for information about our platform's data monitoring features",
            category=FutureWarning,
            stacklevel=2,
        )

        # Types are matched by class name rather than with isinstance --
        # presumably so pandas need not be imported here; TODO confirm.
        if train_features.__class__.__name__ != "DataFrame":
            raise TypeError(
                "`train_features` must be a pandas DataFrame, not {}".format(
                    type(train_features)))
        if train_targets.__class__.__name__ == "Series":
            train_targets = train_targets.to_frame()
        elif train_targets.__class__.__name__ != "DataFrame":
            raise TypeError(
                "`train_targets` must be a pandas DataFrame or Series, not {}".
                format(type(train_targets)))

        # check for overlapping column names before joining the two frames
        common_column_names = set(train_features.columns) & set(
            train_targets.columns)
        if common_column_names:
            raise ValueError(
                "`train_features` and `train_targets` combined have overlapping column names;"
                " please ensure column names are unique")

        train_df = train_features.join(train_targets)

        histograms = _histogram_utils.calculate_histograms(train_df)

        response = _utils.make_request("PUT",
                                       self._histogram_endpoint,
                                       self._conn,
                                       json=histograms)
        # any error raised for the response propagates to the caller
        _utils.raise_for_http_error(response)
Пример #2
0
    def test_integration(self, experiment_run):
        """Log training data, then compare the stored histograms to local ones.

        A 30-row DataFrame with one boolean, one small-integer and one
        ``range`` column is built. Histograms are computed locally, the first
        two columns are logged as features and the third as targets through
        ``experiment_run.log_training_data``, and the monitoring reference
        endpoint is fetched with a ``GET`` request. The retrieved JSON is then
        compared against the local histograms column by column.

        Skipped when numpy or pandas is not importable.
        """
        np = pytest.importorskip("numpy")
        pd = pytest.importorskip("pandas")

        binary_col = 'binary col'
        discrete_col = 'discrete col'
        float_col = 'float col'

        columns = [
            pd.Series([True] * 10 + [False] * 20, name=binary_col),
            pd.Series([0] * 5 + [1] * 10 + [2] * 15, name=discrete_col),
            pd.Series(range(30), name=float_col),
        ]
        df = pd.concat(objs=columns, axis='columns')
        expected = _histogram_utils.calculate_histograms(df)

        feature_frame = df[[binary_col, discrete_col]]
        target_series = df[float_col]
        experiment_run.log_training_data(feature_frame, target_series)

        conn = experiment_run._conn
        endpoint = "{}://{}/api/v1/monitoring/data/references/{}".format(
            conn.scheme,
            conn.socket,
            experiment_run.id,
        )
        response = _utils.make_request("GET", endpoint, conn)
        _utils.raise_for_http_error(response)
        retrieved = response.json()

        def typed_histogram(all_histograms, column, kind):
            # drill down to the per-type payload of one column's histogram
            return all_histograms['features'][column]['histogram'][kind]

        # features match
        expected_features = expected['features']
        retrieved_features = retrieved['features']
        assert set(expected_features.keys()) == set(retrieved_features.keys())

        # binary matches
        local_binary = typed_histogram(expected, binary_col, 'binary')
        remote_binary = typed_histogram(retrieved, binary_col, 'binary')
        assert local_binary['count'] == remote_binary['count']

        # discrete matches
        local_discrete = typed_histogram(expected, discrete_col, 'discrete')
        remote_discrete = typed_histogram(retrieved, discrete_col, 'discrete')
        assert (local_discrete['bucket_values'] ==
                remote_discrete['bucket_values'])
        assert local_discrete['count'] == remote_discrete['count']

        # float matches; limits are compared with a tolerance via np.isclose
        local_float = typed_histogram(expected, float_col, 'float')
        remote_float = typed_histogram(retrieved, float_col, 'float')
        limits_close = np.isclose(local_float['bucket_limits'],
                                  remote_float['bucket_limits'])
        assert all(limits_close)
        assert local_float['count'] == remote_float['count']
Пример #3
0
    def test_discrete(self):
        """Check that random integer columns produce ``"discrete"`` histograms.

        Builds a 90-row DataFrame with three columns drawn by
        ``np.random.randint``, computes its histograms, asserts that every
        feature's ``'type'`` is ``"discrete"``, and then delegates to
        ``self.assert_histograms_match_dataframe``.

        Skipped when numpy or pandas is not importable.
        """
        np = pytest.importorskip("numpy")
        pd = pytest.importorskip("pandas")
        num_rows = 90

        # (column name, low, high) handed to np.random.randint, in draw order
        column_specs = [
            ("A", 6, 12),
            ("B", -12, -6),
            ("C", -3, 3),
        ]
        columns = [
            pd.Series(np.random.randint(low, high, size=num_rows), name=name)
            for name, low, high in column_specs
        ]
        df = pd.concat(objs=columns, axis='columns')

        histograms = _histogram_utils.calculate_histograms(df)

        for feature_histogram in histograms['features'].values():
            assert feature_histogram['type'] == "discrete"
        self.assert_histograms_match_dataframe(histograms, df)
Пример #4
0
    def test_float(self):
        """Check that normally distributed columns produce ``"float"`` histograms.

        Builds a 90-row DataFrame with three columns drawn by
        ``np.random.normal`` using different ``loc``/``scale`` settings,
        computes its histograms, asserts that every feature's ``'type'`` is
        ``"float"``, and then delegates to
        ``self.assert_histograms_match_dataframe``.

        Skipped when numpy or pandas is not importable.
        """
        np = pytest.importorskip("numpy")
        pd = pytest.importorskip("pandas")
        num_rows = 90

        # drawn in A, B, C order
        col_a = pd.Series(np.random.normal(loc=9, size=num_rows), name="A")
        col_b = pd.Series(np.random.normal(scale=12, size=num_rows), name="B")
        col_c = pd.Series(
            np.random.normal(loc=-3, scale=6, size=num_rows),
            name="C",
        )
        df = pd.concat(objs=[col_a, col_b, col_c], axis='columns')

        histograms = _histogram_utils.calculate_histograms(df)

        feature_histograms = histograms['features'].values()
        assert all(entry['type'] == "float" for entry in feature_histograms)
        self.assert_histograms_match_dataframe(histograms, df)
Пример #5
0
    def test_binary(self):
        """Check that random boolean columns produce ``"binary"`` histograms.

        Builds a 90-row DataFrame with three boolean columns (uniform random
        values rounded and cast to ``bool``), computes its histograms, asserts
        that every feature's ``'type'`` is ``"binary"``, and then delegates to
        ``self.assert_histograms_match_dataframe``.

        Skipped when numpy or pandas is not importable.
        """
        np = pytest.importorskip("numpy")
        pd = pytest.importorskip("pandas")
        num_rows = 90

        def random_bool_series(name):
            # round uniform samples, then cast to bool
            samples = np.random.random(size=num_rows)
            return pd.Series(samples.round().astype(bool), name=name)

        columns = [random_bool_series(name) for name in ("A", "B", "C")]
        df = pd.concat(objs=columns, axis='columns')

        histograms = _histogram_utils.calculate_histograms(df)

        for feature_histogram in histograms['features'].values():
            assert feature_histogram['type'] == "binary"
        self.assert_histograms_match_dataframe(histograms, df)