def test_estimate_segments():
    """Check the segments chosen for a small frame at several ``max_segments`` limits."""
    frame = pd.DataFrame(
        {
            "target": ["hat", "jug", "hat"],
            "confidence": [1.2, 3.4, 4.5],
            "sentiment": ["happy", "sad", "sad"],
        }
    )
    # Each entry pairs a max_segments limit with the segment list expected for it.
    expectations = (
        (4, ["target"]),
        (3, ["target"]),
        (1, []),
    )
    for limit, expected in expectations:
        segments = _estimate_segments(frame, target_field="confidence", max_segments=limit)
        assert segments == expected
def estimate_segments(
    self,
    df: pd.DataFrame,
    name: str,
    target_field: str = None,
    max_segments: int = 30,
    dry_run: bool = False,
) -> Optional[Union[List[Dict], List[str]]]:
    """
    Choose the features and values on which to segment data profiling, using
    entropy-based methods. The computation is delegated to ``_estimate_segments``.

    Unless ``dry_run`` is set, the result is also passed to
    ``self.metadata_writer.autosegmentation_write`` under ``name``.

    :param df: the dataframe of data to profile
    :param name: name for discovery in the logger, automatically applied to
        loggers with same dataset_name
    :param target_field: target field (optional)
    :param max_segments: upper threshold for total combinations of segments,
        default 30
    :param dry_run: run calculation but do not write results to metadata
    :return: a list of segmentation feature names
    """
    estimated = _estimate_segments(
        df=df,
        target_field=target_field,
        max_segments=max_segments,
    )
    # A dry run only reports the estimate; nothing is written to metadata.
    if dry_run:
        return estimated
    self.metadata_writer.autosegmentation_write(name, estimated)
    return estimated
def test_estimate_segments_empty():
    """A dataframe with no rows is expected to yield an empty segment list."""
    columns = ("target", "confidence", "sentiment")
    empty_frame = pd.DataFrame({column: [] for column in columns})
    assert _estimate_segments(empty_frame, target_field="confidence", max_segments=4) == []