예제 #1
0
class Numeric(BaseIntent):
    """Intent describing a column whose values can be read as numbers.

    ``confidence_computation`` is a ``{metric: weight}`` mapping; the four
    weights below add up to 1.0.

    """

    confidence_computation = {
        MetricWrapper(num_valid): 0.3,
        MetricWrapper(unique_heur, invert=True): 0.2,
        MetricWrapper(is_numeric): 0.4,
        MetricWrapper(is_string, invert=True): 0.1,
    }

    def fit(self, X, y=None, **fit_params):
        """Do nothing and hand back the instance.

        Args:
            X: The input data (not used)
            y: The response variable (not used)
            **fit_params: Additional fit parameters (not used)

        Returns:
            self

        """
        return self

    def transform(self, X, y=None):
        """Coerce the data to numeric values.

        ``pd.to_numeric`` is applied through ``X.apply`` with
        ``errors="coerce"``, so entries that cannot be parsed come back
        as NaN rather than raising.

        Args:
            X: The input data
            y: The response variable (not used)

        Returns:
            The result of applying ``pd.to_numeric`` across ``X``.

        """
        coerced = X.apply(pd.to_numeric, errors="coerce")
        return coerced

    @classmethod
    def column_summary(cls, df):  # noqa
        """Extend the standard column summary with numeric statistics.

        Only the first column of ``df`` is inspected.

        Args:
            df: The data to summarise; its first column is coerced to
                numbers with ``errors="coerce"``.

        Returns:
            The object returned by ``standard_col_summary(df)``, updated
            in place with ``invalid_percent``, ``mean``, ``std``, ``min``,
            ``25%``, ``50%``, ``75%``, ``max`` and ``5_outliers``.

        """
        summary = standard_col_summary(df)

        numbers = pd.to_numeric(df.iloc[:, 0], errors="coerce")

        # Percentage of entries that are null after coercion, minus the
        # percentage the standard summary already reports as NaN.
        null_after_coercion_pct = (
            numbers.isnull().sum() * 100.0 / summary["count"]
        )
        invalid_pct = null_after_coercion_pct - summary["nan_percent"]

        outliers = get_outliers(numbers, count=5).values.tolist()

        # Built step by step so the keys keep their original order.
        stats = [
            ("invalid_percent", invalid_pct),
            ("mean", float(numbers.mean())),
            ("std", float(numbers.std())),
            ("min", float(numbers.min())),
        ]
        for label, q in (("25%", 0.25), ("50%", 0.5), ("75%", 0.75)):
            stats.append((label, float(numbers.quantile(q))))
        stats.append(("max", float(numbers.max())))
        stats.append(("5_outliers", outliers))

        summary.update(stats)
        return summary
예제 #2
0
def test_metric_invert(retval):
    """Check that ``invert=True`` makes the wrapper report ``1 - value``.

    Args:
        retval: value the wrapped metric function returns; presumably
            supplied by a parametrization outside this block.

    """
    from foreshadow.metrics import MetricWrapper

    def constant_metric(X):
        # Ignores its input so the wrapped result is fully controlled.
        return retval

    wrapper = MetricWrapper(constant_metric, 0, invert=True)

    expected = 1 - retval
    observed = wrapper.calculate([1, 2, 3])
    assert expected == observed
예제 #3
0
def test_metric_default_return():
    """Check the wrapper's default return value when the metric raises."""
    from foreshadow.metrics import MetricWrapper

    def failing_metric(X):
        # Always errors, so the wrapper cannot produce a real result.
        raise Exception

    fallback = 0
    wrapper = MetricWrapper(failing_metric, fallback)
    outcome = wrapper.calculate([1, 2, 3])
    assert 0 == outcome
예제 #4
0
def test_metric_last_call(metric_fn, arg, kwargs):
    """Check that ``last_call()`` yields 1 after ``calculate()`` has run.

    Args:
        metric_fn: arbitrary metric function to wrap
        arg: positional argument forwarded to ``calculate``
        kwargs: dict of keyword arguments forwarded to ``calculate``

    """
    from foreshadow.metrics import MetricWrapper

    wrapper = MetricWrapper(metric_fn)
    # The direct result is discarded on purpose; only last_call() is checked.
    wrapper.calculate(arg, **kwargs)
    assert wrapper.last_call() == 1
예제 #5
0
 def __init__(self):
     """Initialise the parent with a single transformation and one metric.

     The parent initializer receives ``[drop_transform]`` as its
     transformations and a confidence computation that gives
     ``calculate_percentage_of_rows_matching_regex`` a weight of 1.
     """
     only_transformation = [drop_transform]
     metric_weights = {
         MetricWrapper(calculate_percentage_of_rows_matching_regex): 1
     }
     super().__init__(
         only_transformation, confidence_computation=metric_weights
     )
예제 #6
0
    def __init__(
        self,
        transformations,
        output_columns=None,
        confidence_computation=None,
        default=return_original_row,
    ):
        """Construct any cleaner/flattener.

        Args:
            transformations: Stored unchanged on ``self.transformations``.
                Expected to be callable(s) that take a string and return a
                tuple of the number of characters transformed followed by
                the transformed string.
            output_columns: ``None``, an int, or a list. When ``None``, any
                lists returned by the transformations are assumed to be
                separate columns in the new DataFrame; otherwise this
                gives the names of the desired output columns.
            confidence_computation: Optional ``{metric: weight}`` dict for
                the subclass's metric computation (this implies an OVR
                model). When ``None``, a default of 0.8 for
                ``calculate_percentage_of_rows_matching_regex`` and 0.2
                for ``avg_col_regex`` is used.
            default: Function that returns the default value for a row if
                the transformation failed. Accepts the row as input.

        Raises:
            ValueError: If ``output_columns`` is not a list, an int, or
                None.

        """
        if output_columns is not None and not isinstance(
            output_columns, (int, list)
        ):
            raise ValueError("output columns not a valid type")

        self.default = default
        self.output_columns = output_columns
        self.transformations = transformations

        # The default weights are always built, then replaced if the
        # caller supplied their own mapping.
        default_metrics = {
            MetricWrapper(calculate_percentage_of_rows_matching_regex): 0.8,
            MetricWrapper(avg_col_regex): 0.2,
        }
        self.confidence_computation = (
            default_metrics
            if confidence_computation is None
            else confidence_computation
        )
예제 #7
0
class Text(BaseIntent):
    """Intent describing a column that holds text.

    ``confidence_computation`` is a ``{metric: weight}`` mapping; the five
    metrics below are weighted equally at 0.2 each.

    """

    confidence_computation = {
        MetricWrapper(num_valid): 0.2,
        MetricWrapper(unique_heur): 0.2,
        MetricWrapper(is_numeric, invert=True): 0.2,
        MetricWrapper(is_string): 0.2,
        MetricWrapper(has_long_text): 0.2,
    }

    def fit(self, X, y=None, **fit_params):
        """Do nothing and hand back the instance.

        Args:
            X: The input data (not used)
            y: The response variable (not used)
            **fit_params: Additional fit parameters (not used)

        Returns:
            self

        """
        return self

    def transform(self, X, y=None):
        """Cast the data to strings.

        Args:
            X: The input data
            y: The response variable (not used)

        Returns:
            The result of ``X.astype(str)``.

        """
        as_text = X.astype(str)
        return as_text

    @classmethod
    def column_summary(cls, df):  # noqa
        """Return the standard column summary for ``df``, unmodified.

        Args:
            df: The data to summarise

        Returns:
            Whatever ``standard_col_summary(df)`` returns.

        """
        summary = standard_col_summary(df)
        return summary
예제 #8
0
def test_metric_print(fn, regex):
    """Check that a metric's string-producing method mentions useful info.

    Args:
        fn: name of the ``MetricWrapper`` method to call; it must return
            a string
        regex: pattern that must be found in the returned string

    """
    from foreshadow.metrics import MetricWrapper

    # Kept as a lambda: the wrapped function's name may appear in the output.
    wrapper = MetricWrapper(lambda x: 1)
    describe = getattr(wrapper, fn)
    assert re.search(regex, describe())
예제 #9
0
class Categorical(BaseIntent):
    """Intent describing a categorical column.

    ``confidence_computation`` is a ``{metric: weight}`` mapping; the three
    weights below add up to 1.0, with ``unique_heur`` weighted highest.

    """

    confidence_computation = {
        MetricWrapper(num_valid): 0.25,
        MetricWrapper(unique_heur): 0.65,
        MetricWrapper(is_numeric, invert=True): 0.1,
    }

    def fit(self, X, y=None, **fit_params):
        """Do nothing and hand back the instance.

        Args:
            X: The input data (not used)
            y: The response variable (not used)
            **fit_params: Additional fit parameters (not used)

        Returns:
            self

        """
        return self

    def transform(self, X, y=None):
        """Return the input untouched.

        Args:
            X: The input data
            y: The response variable (not used)

        Returns:
            ``X`` itself, with no conversion applied.

        """
        return X

    @classmethod
    def column_summary(cls, df):  # noqa
        """Return the standard column summary for ``df``, unmodified.

        Args:
            df: The data to summarise

        Returns:
            Whatever ``standard_col_summary(df)`` returns.

        """
        summary = standard_col_summary(df)
        return summary