def log_training_data(self, train_features, train_targets, overwrite=False):
    """
    Associate training data with this model reference.

    The features and targets are joined into a single DataFrame, histograms
    are calculated from it, and the result is sent with a ``PUT`` request to
    this object's histogram endpoint.

    .. versionchanged:: 0.14.4
        Instead of uploading the data itself as a CSV artifact
        ``'train_data'``, this method now generates a histogram for internal
        use by our deployment data monitoring system.

    .. deprecated:: 0.18.0
        This method is no longer supported. Please see our documentation for
        information about our platform's data monitoring features.

    Parameters
    ----------
    train_features : pd.DataFrame
        pandas DataFrame representing features of the training data.
    train_targets : pd.DataFrame or pd.Series
        pandas DataFrame or Series representing targets of the training
        data. A Series is converted to a single-column DataFrame.
    overwrite : bool, default False
        Whether to allow overwriting existing training data. Note that this
        value is not read anywhere in this method's body.

    Warns
    -----
    FutureWarning
        Always emitted, before any validation, because this method is
        deprecated.

    Raises
    ------
    TypeError
        If `train_features` is not a DataFrame, or `train_targets` is
        neither a DataFrame nor a Series.
    ValueError
        If `train_features` and `train_targets` share any column name.

    """
    # stacklevel=2 attributes the warning to the code that called this
    # method, so users see their own line instead of this library line.
    warnings.warn(
        "This method is no longer supported. Please see our documentation"
        " for information about our platform's data monitoring features",
        category=FutureWarning,
        stacklevel=2,
    )

    # Argument types are recognized by class name rather than isinstance(),
    # so this method does not reference the pandas module directly.
    if train_features.__class__.__name__ != "DataFrame":
        raise TypeError(
            "`train_features` must be a pandas DataFrame, not {}".format(
                type(train_features)))
    if train_targets.__class__.__name__ == "Series":
        train_targets = train_targets.to_frame()
    elif train_targets.__class__.__name__ != "DataFrame":
        raise TypeError(
            "`train_targets` must be a pandas DataFrame or Series, not {}".
            format(type(train_targets)))

    # check for overlapping column names, which would make the join ambiguous
    common_column_names = set(train_features.columns) & set(
        train_targets.columns)
    if common_column_names:
        raise ValueError(
            "`train_features` and `train_targets` combined have overlapping column names;"
            " please ensure column names are unique")

    train_df = train_features.join(train_targets)

    histograms = _histogram_utils.calculate_histograms(train_df)

    response = _utils.make_request("PUT",
                                   self._histogram_endpoint,
                                   self._conn,
                                   json=histograms)
    _utils.raise_for_http_error(response)
def test_integration(self, experiment_run):
    """
    Log training data through `experiment_run`, then fetch the stored
    histograms from the monitoring references endpoint and compare them with
    histograms calculated locally from the same DataFrame.

    """
    np = pytest.importorskip("numpy")
    pd = pytest.importorskip("pandas")

    binary_name = 'binary col'
    discrete_name = 'discrete col'
    float_name = 'float col'

    columns = [
        pd.Series([True] * 10 + [False] * 20, name=binary_name),
        pd.Series([0] * 5 + [1] * 10 + [2] * 15, name=discrete_name),
        pd.Series(range(30), name=float_name),
    ]
    frame = pd.concat(objs=columns, axis='columns')

    # locally computed reference the server's copy is compared against
    expected = _histogram_utils.calculate_histograms(frame)

    # the first two columns are logged as features, the third as targets
    feature_frame = frame[[binary_name, discrete_name]]
    target_series = frame[float_name]
    experiment_run.log_training_data(feature_frame, target_series)

    conn = experiment_run._conn
    url = "{}://{}/api/v1/monitoring/data/references/{}".format(
        conn.scheme,
        conn.socket,
        experiment_run.id,
    )
    response = _utils.make_request("GET", url, conn)
    _utils.raise_for_http_error(response)
    actual = response.json()

    def lookup(source, column, kind):
        # drill down to one column's histogram payload of the given kind
        return source['features'][column]['histogram'][kind]

    # features match
    expected_names = set(expected['features'].keys())
    actual_names = set(actual['features'].keys())
    assert expected_names == actual_names

    # binary matches
    expected_binary = lookup(expected, binary_name, 'binary')
    actual_binary = lookup(actual, binary_name, 'binary')
    assert expected_binary['count'] == actual_binary['count']

    # discrete matches
    expected_discrete = lookup(expected, discrete_name, 'discrete')
    actual_discrete = lookup(actual, discrete_name, 'discrete')
    assert expected_discrete['bucket_values'] == actual_discrete['bucket_values']
    assert expected_discrete['count'] == actual_discrete['count']

    # float matches; bucket limits are compared with a tolerance
    expected_float = lookup(expected, float_name, 'float')
    actual_float = lookup(actual, float_name, 'float')
    limits_close = np.isclose(expected_float['bucket_limits'],
                              actual_float['bucket_limits'])
    assert all(limits_close)
    assert expected_float['count'] == actual_float['count']
def test_discrete(self):
    """
    Check that random integer columns all get histograms of type
    "discrete" and that those histograms match the DataFrame.

    """
    np = pytest.importorskip("numpy")
    pd = pytest.importorskip("pandas")

    num_rows = 90
    # (column name, low, high) arguments for np.random.randint
    specs = [
        ("A", 6, 12),
        ("B", -12, -6),
        ("C", -3, 3),
    ]
    series = [
        pd.Series(np.random.randint(low, high, size=num_rows), name=name)
        for name, low, high in specs
    ]
    df = pd.concat(objs=series, axis='columns')

    histograms = _histogram_utils.calculate_histograms(df)

    for histogram in histograms['features'].values():
        assert histogram['type'] == "discrete"
    self.assert_histograms_match_dataframe(histograms, df)
def test_float(self):
    """
    Check that normally distributed columns all get histograms of type
    "float" and that those histograms match the DataFrame.

    """
    np = pytest.importorskip("numpy")
    pd = pytest.importorskip("pandas")

    num_rows = 90
    # (column name, keyword arguments for np.random.normal)
    specs = [
        ("A", dict(loc=9)),
        ("B", dict(scale=12)),
        ("C", dict(loc=-3, scale=6)),
    ]
    series = [
        pd.Series(np.random.normal(size=num_rows, **params), name=name)
        for name, params in specs
    ]
    df = pd.concat(objs=series, axis='columns')

    histograms = _histogram_utils.calculate_histograms(df)

    for histogram in histograms['features'].values():
        assert histogram['type'] == "float"
    self.assert_histograms_match_dataframe(histograms, df)
def test_binary(self):
    """
    Check that random boolean columns all get histograms of type "binary"
    and that those histograms match the DataFrame.

    """
    np = pytest.importorskip("numpy")
    pd = pytest.importorskip("pandas")

    num_rows = 90
    # each column is uniform random floats rounded to 0/1, then cast to bool
    series = [
        pd.Series(np.random.random(size=num_rows).round().astype(bool),
                  name=name)
        for name in ("A", "B", "C")
    ]
    df = pd.concat(objs=series, axis='columns')

    histograms = _histogram_utils.calculate_histograms(df)

    for histogram in histograms['features'].values():
        assert histogram['type'] == "binary"
    self.assert_histograms_match_dataframe(histograms, df)