Exemplo n.º 1
0
def test_column_cardinality_functions(cardinality_validator):
    """Check cardinality labels from the profiler and from the classifier.

    The first loop verifies the ``cardinality`` entry that
    ``UserConfigurableProfiler`` stores in ``column_info`` for several
    columns of the supplied validator. The second loop calls
    ``OrderedProfilerCardinality.get_basic_column_cardinality`` directly with
    only one (or neither) of ``num_unique`` / ``pct_unique`` supplied and
    checks the name of the returned member.
    """
    profiler = UserConfigurableProfiler(cardinality_validator)

    # NOTE(review): a "col_none" -> "NONE" check was already commented out in
    # this test, so it is intentionally left out of the table; the reason is
    # not visible here.
    expected_by_column = (
        ("col_one", "ONE"),
        ("col_two", "TWO"),
        ("col_very_few", "VERY_FEW"),
        ("col_few", "FEW"),
        ("col_many", "MANY"),
        ("col_very_many", "VERY_MANY"),
    )
    for column_name, expected_label in expected_by_column:
        column_entry = profiler.column_info.get(column_name)
        assert column_entry.get("cardinality") == expected_label

    # Each case: keyword arguments passed to the classifier, expected name.
    classify = OrderedProfilerCardinality.get_basic_column_cardinality
    partial_input_cases = (
        ({"num_unique": 10}, "VERY_FEW"),  # count only
        ({"pct_unique": 1.0}, "UNIQUE"),   # proportion only, fully unique
        ({}, "NONE"),                      # neither value supplied
        ({"pct_unique": 0.5}, "NONE"),     # proportion only, half unique
    )
    for classifier_kwargs, expected_label in partial_input_cases:
        assert classify(**classifier_kwargs).name == expected_label
Exemplo n.º 2
0
    def _get_column_cardinality(self, dataset, column):
        """
        Determines the cardinality of a column using the get_basic_column_cardinality method from
        OrderedProfilerCardinality

        Args:
            dataset: A GE Dataset
            column: The column for which to get cardinality

        Returns:
            The ``name`` of the cardinality member returned by
            ``OrderedProfilerCardinality.get_basic_column_cardinality``
        """
        num_unique = None
        pct_unique = None

        try:
            num_unique = dataset.expect_column_unique_value_count_to_be_between(
                column, None, None).result["observed_value"]
            pct_unique = (
                dataset.
                expect_column_proportion_of_unique_values_to_be_between(
                    column, None, None).result["observed_value"])
        except KeyError:  # if observed_value value is not set
            # Best effort: whichever value could not be read stays None and
            # the classification below proceeds with what is available.
            # Lazy %s arguments are used instead of "{:s}".format(column):
            # the "s" format code raises ValueError for non-string column
            # labels (e.g. integers), which would turn this recovery path
            # into a crash.
            logger.error(
                "Failed to get cardinality of column %s - continuing...",
                column)
        # Previously, if we had 25 possible categories out of 1000 rows, this would come up as many, because of its
        #  percentage, so it was tweaked here, but is still experimental.
        cardinality = OrderedProfilerCardinality.get_basic_column_cardinality(
            num_unique, pct_unique)

        return cardinality.name