Example #1
def _estimate_entropy(x: np.ndarray, epsilon: np.ndarray) -> float:
    """Estimate dataset entropy."""
    x = asarray2d(x)
    n, d = x.shape

    # not enough data
    if n <= 1 or d == 0:
        return 0

    disc_mask = _get_disc_columns(x)
    cont_mask = ~disc_mask

    # if all columns are disc, use discrete-specific estimator
    if np.all(disc_mask):
        return _estimate_disc_entropy(x)

    # if all columns are cont, use continuous-specific estimator
    if np.all(cont_mask):
        return _estimate_cont_entropy(x, epsilon)

    # Separate the dataset into discrete and continuous datasets disc and cont
    disc = asarray2d(x[:, disc_mask])
    cont = asarray2d(x[:, cont_mask])

    # H(c|d)
    H_c_d = _estimate_conditional_entropy(cont, disc, epsilon)

    # H(d)
    H_d = _estimate_disc_entropy(disc)

    return H_d + H_c_d
Example #2
def test_asarray2d_series():
    # case: pd.Series
    a = np.zeros((3, ))
    ser = pd.Series(a)
    result = asarray2d(ser)
    assert result.shape[1] >= 1
    assert_array_equal(result, asarray2d(a))
Example #3
def _estimate_disc_entropy(x: np.ndarray) -> float:
    r"""Estimate the Shannon entropy of a discrete dataset.

    The Shannon entropy of a discrete random variable :math:`Z` with support
    :math:`\mathbb{Z}` and density :math:`P_Z` is given as

    .. math::
        H(Z) = -\sum_{z \in \mathbb{Z}} P_Z(z) \log(P_Z(z))

    Here, since we do not know :math:`P_Z`, we estimate :math:`\hat{P}_Z`, the
    empirical probability, calculated from the frequency of each event in the
    dataset x.

    If x's columns logically represent continuous features, it is better to use
    the `_estimate_cont_entropy` function. If you are unsure of which to use,
    `estimate_entropy` can take datasets of mixed discrete and continuous
    features.

    Args:
        x: Dataset with shape (n_samples, n_features) or
            (n_samples, )

    Returns:
        the dataset entropy.
    """
    x = asarray2d(x)
    pk, _ = _compute_empirical_probability(x)
    return scipy.stats.entropy(pk)
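The empirical-PMF approach described in the docstring can be reproduced directly with NumPy and SciPy. A minimal illustrative sketch (not the ballet source itself):

import numpy as np
import scipy.stats

x = np.array([[0], [0], [1], [1], [1], [2]])          # 6 samples, 1 feature
_, counts = np.unique(x, axis=0, return_counts=True)   # frequency of each unique row
pk = counts / len(x)                                    # empirical P_Z: [1/3, 1/2, 1/6]
print(scipy.stats.entropy(pk))                          # -sum(pk * log(pk)), in nats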
Example #4
def test_asarray2d_shape_n():
    # case: second dimension not present
    a = np.zeros((3, ))
    result = asarray2d(a)
    expected_shape = (3, 1)
    assert result.shape == expected_shape
    assert_array_equal(np.ravel(result), a)
Example #5
 def judge(self):
     logger.info(f'Judging feature using {self}')
     z = (self.candidate_feature.as_feature_engineering_pipeline().fit(
         self.X_df, y=self.y_df).transform(self.X_df_val))
     y = self.y_val
     z, y = asarray2d(z), asarray2d(y)
     z, y = self._handle_nans(z, y)
     if z is None and y is None:
         # nans were found and handle_nan_targets == 'fail'
         return False
     mi = estimate_mutual_information(z, y)
     delta = mi - self.threshold
     outcome = delta > 0
     logger.info(f'Mutual information with target I(Z;Y) is {mi} vs. '
                 f'threshold {self.threshold} ({delta} above threshold)')
     return outcome
Example #6
def test_asarray2d_df():
    # case: pd.DataFrame
    a = np.zeros((3, 2))
    df = pd.DataFrame(a)
    result = asarray2d(df)
    assert result.shape == df.shape
    assert result.shape[1] >= 1
    assert_array_equal(result, a)
Example #7
    def _test_robust_transformer(self,
                                 input_types,
                                 bad_input_checks,
                                 catches,
                                 transformer_maker=FragileTransformer):
        fragile_transformer = transformer_maker(bad_input_checks, catches)
        robust_transformer = DelegatingRobustTransformer(
            transformer_maker(bad_input_checks, catches))

        for input_type in input_types:
            X, y = self.d[input_type]
            # fragile transformer raises error
            with self.assertRaises(catches):
                fragile_transformer.fit_transform(X, y)
            # robust transformer does not raise error
            X_robust = robust_transformer.fit_transform(X, y)
            self.assertTrue(np.array_equal(asarray2d(X), asarray2d(X_robust)))
Example #8
def _concat_datasets(dfs_by_src, n_samples=0, omit=None):
    if omit is None:
        omit = []
    filtered_dfs = [
        np.array(dfs_by_src[x]) for x in dfs_by_src if x not in omit
    ]
    if len(filtered_dfs) == 0:
        return np.zeros((n_samples, 1))
    return asarray2d(np.concatenate(filtered_dfs, axis=1))
Example #9
 def __init__(self, *args, lmbda_1=0., lmbda_2=0.):
     super().__init__(*args)
     self.y = asarray2d(self.y)
     if lmbda_1 <= 0:
         lmbda_1 = estimate_entropy(self.y) / LAMBDA_1_ADJUSTMENT
     if lmbda_2 <= 0:
         lmbda_2 = estimate_entropy(self.y) / LAMBDA_2_ADJUSTMENT
     self.lmbda_1 = lmbda_1
     self.lmbda_2 = lmbda_2
Example #10
def test_cont_disc_entropy_differs_cont():
    """Expect cont, disc columns to have different entropy"""
    cont = asarray2d(np.arange(50)) + 0.5
    epsilon = _compute_epsilon(cont)

    H_cont = _estimate_cont_entropy(cont, epsilon)
    H_disc = _estimate_disc_entropy(cont)

    assert H_cont != H_disc
Example #11
def test_robust_transformer(
    input_types,
    bad_input_checks,
    catches,
    transformer_maker,
    sample_data,
):
    fragile_transformer = transformer_maker(bad_input_checks, catches)
    robust_transformer = DelegatingRobustTransformer(
        transformer_maker(bad_input_checks, catches))

    for input_type in input_types:
        X, y = sample_data[input_type]
        # fragile transformer raises error
        with pytest.raises(catches):
            fragile_transformer.fit_transform(X, y)
        # robust transformer does not raise error
        X_robust = robust_transformer.fit_transform(X, y)
        assert np.array_equal(asarray2d(X), asarray2d(X_robust))
Example #12
def test_entropy_multiple_disc():
    same_val_arr_zero = np.zeros((50, 1))
    same_val_arr_ones = np.ones((50, 1))
    # The 0.5 forces float => classified as continuous
    cont_val_arange = asarray2d(np.arange(50) + 0.5)
    all_disc_arr = np.concatenate((same_val_arr_ones, same_val_arr_zero),
                                  axis=1)
    mixed_val_arr = np.concatenate((all_disc_arr, cont_val_arange), axis=1)

    all_disc_h = estimate_entropy(all_disc_arr)
    mixed_h = estimate_entropy(mixed_val_arr)
    assert mixed_h > all_disc_h, \
        'Expected adding a continuous column to increase entropy'
Example #13
def test_cont_disc_entropy_differs_disc(get_disc_columns):
    """Expect cont, disc columns to have different entropy"""
    disc = asarray2d(np.arange(50))

    # we run into trouble here because disc is *actually* discrete, so
    # epsilon would not be calculated (it is set to some dummy value of
    # -inf). instead, we patch get_disc_columns and "force" epsilon to be
    # calculated
    epsilon = _compute_epsilon(disc)
    H_cont = _estimate_cont_entropy(disc, epsilon)

    H_disc = _estimate_disc_entropy(disc)

    assert H_cont != H_disc
Example #14
 def __init__(self,
              *args,
              lmbda_1: float = 0.0,
              lmbda_2: float = 0.0,
              lambda_1_adjustment: float = LAMBDA_1_ADJUSTMENT,
              lambda_2_adjustment: float = LAMBDA_2_ADJUSTMENT):
     super().__init__(*args)
     self.y_val = asarray2d(self.y_val)
     if lmbda_1 <= 0:
         lmbda_1 = estimate_entropy(self.y_val) / lambda_1_adjustment
     if lmbda_2 <= 0:
         lmbda_2 = estimate_entropy(self.y_val) / lambda_2_adjustment
     self.lmbda_1 = lmbda_1
     self.lmbda_2 = lmbda_2
Example #15
def _concat_datasets(feature_df_map: Dict[Feature, pd.DataFrame],
                     n_samples: int = 0,
                     omit: Optional[List[Feature]] = None) -> np.ndarray:
    if omit is None:
        omit = []

    filtered_dfs = [
        np.array(feature_df_map[feature]) for feature in feature_df_map
        if feature not in omit
    ]

    if not filtered_dfs:
        return np.zeros((n_samples, 1))

    return asarray2d(np.concatenate(filtered_dfs, axis=1))
Example #16
def _compute_empirical_probability(x):
    """Compute empirical probability of events in x

    Args:
        x: array-like

    Returns:
        pk: array-like of shape (K,) where where p[k] is the probability of
            event k
        events: array-like of shape (K, m) where each event is a vector of
            length m and there are K unique events
    """
    x = asarray2d(x)
    n, _ = x.shape
    events, counts = np.unique(x, axis=0, return_counts=True)
    pk = counts * 1.0 / n
    return pk, events
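For reference, a small traced-by-hand usage of the helper above (assuming `_compute_empirical_probability` and `asarray2d` are in scope; the data is illustrative):

import numpy as np

x = np.array([[0, 1], [0, 1], [1, 1], [2, 0]])
pk, events = _compute_empirical_probability(x)
# events -> [[0, 1], [1, 1], [2, 0]]   (the 3 unique rows, sorted)
# pk     -> [0.5, 0.25, 0.25]          (their relative frequencies)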
Example #17
def estimate_entropy(x):
    r"""Estimate dataset entropy.

    This function can take datasets of mixed discrete and continuous features,
    and uses a set of heuristics to determine which functions to apply to
    each. Discrete (Shannon) entropy is estimated via the empirical
    probability mass function. Continuous (differential) entropy is
    estimated via the KSG estimator [1].

    Let x be made of continuous features c and discrete features d.
    To deal with both continuous and discrete features, we use the
    following decomposition of entropy:

    .. math::
       :nowrap:

       \begin{align}
       H(x) &= H(c,d) \\
            &= H(d) + H(c | d) \\
            &= \sum_{x \in d} p(x) H(c(x)) + H(d),
       \end{align}

    where :math:`c(x)` denotes the rows of the continuous part of the dataset
    whose corresponding discrete part takes the value :math:`x` in the
    original dataset.

    Args:
        x (array-like): Dataset with shape (n_samples, n_features) or
            (n_samples, )

    Returns:
        float: Dataset entropy of X.

    References:

    .. [1] A. Kraskov, H. Stogbauer and P. Grassberger, "Estimating mutual
           information". Phys. Rev. E 69, 2004.
    """
    x = asarray2d(x)
    epsilon = _compute_epsilon(x)
    return _estimate_entropy(x, epsilon)
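A hedged usage sketch of the decomposition above: a dataset with one discrete and one continuous column, assuming `estimate_entropy` is importable from the module that defines it (the data below is illustrative):

import numpy as np

d = np.repeat([[0.0], [1.0]], 25, axis=0)   # discrete column: two repeated values
c = np.random.rand(50, 1)                    # continuous column
x = np.hstack([d, c])
print(estimate_entropy(x))                   # computed internally as H(d) + H(c|d)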
Example #18
def _compute_epsilon(x: np.ndarray) -> np.ndarray:
    """Calculate epsilon from KSG Estimator

    Represents twice the distance of each element to its k-th nearest neighbor.

    Args:
        x: An array with shape (n_samples, n_features)

    Returns:
        An array with shape (n_samples, 1) representing
            epsilon as described above.

    References:

    .. [1] A. Kraskov, H. Stogbauer and P. Grassberger, "Estimating mutual
           information". Phys. Rev. E 69, 2004.

    """
    k = N_NEIGHBORS
    n = x.shape[0]

    disc_mask = _get_disc_columns(x)
    if np.all(disc_mask):
        # if no continuous columns, there's no point getting epsilon
        return np.full((n, 1), -np.inf)
    c = x[:, ~disc_mask]

    nn = _make_neighbors(n_neighbors=k)
    nn.fit(c)
    distances = np.zeros(n)

    # if the k-th neighbor is at distance 0, then we are in trouble,
    # but we can work around it by increasing k, as long as the old
    # value of k is not needed again later
    #
    # we aim to make this safer by deciding that columns with many repeated
    # values are discrete, not continuous (see _is_disc_column). we could also
    # add a small amount of noise to the whole column, or try something else
    # entirely.
    while not np.all(distances) and k < n:
        # distances to k-nearest neighbor
        distances = nn.kneighbors(n_neighbors=k)[0][:, -1]
        k += 1

    return asarray2d(2. * distances)
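The role of epsilon can be illustrated with scikit-learn directly: it is twice the Chebyshev distance from each sample to its k-th nearest neighbor among the continuous columns. A standalone sketch using `NearestNeighbors` explicitly (rather than the library's `_make_neighbors` helper):

import numpy as np
from sklearn.neighbors import NearestNeighbors

k = 3
c = np.random.rand(50, 2)                       # continuous columns only
nn = NearestNeighbors(metric='chebyshev').fit(c)
dist, _ = nn.kneighbors(n_neighbors=k)          # each point excluded from its own neighbors
epsilon = 2.0 * dist[:, [-1]]                   # shape (n_samples, 1)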
Example #19
def _estimate_cont_entropy(x: np.ndarray, epsilon: np.ndarray) -> float:
    """Estimate the differential entropy of a continuous dataset.

    Based on the KSG Estimator [1] for a dataset's differential entropy.
    If epsilon is provided, this is a partial estimation of the KSG entropy
    estimator. The bias is cancelled out when computing mutual information.

    The function relies on nonparametric methods based on entropy estimation
    from k-nearest-neighbor distances as proposed in [1] and later augmented
    for mutual information estimation.

    If X's columns logically represent discrete features, it is better to use
    the _estimate_disc_entropy function. If you are unsure of which to use,
    _estimate_entropy can take datasets of mixed discrete and continuous
    features.

    Observe that differential entropy is *not* the "extension" of the
    Shannon entropy and thus it does not exhibit some properties like
    non-negativity (i.e. values below zero are possible).

    Args:
        x: Dataset with shape (n_samples, n_features) or
            (n_samples, )
        epsilon: An array with shape (n_samples, 1) that is
            the epsilon used in KSG Estimator. Represents the Chebyshev
            distance from an element to its k-th nearest neighbor in the full
            dataset.

    Returns:
        differential entropy of the dataset

    References:

    .. [1] A. Kraskov, H. Stogbauer and P. Grassberger, "Estimating mutual
           information". Phys. Rev. E 69, 2004.

    """
    x = asarray2d(x)
    n, d = x.shape
    nx = _compute_n_points_within_radius(x, epsilon / 2.0)
    c_d = _compute_volume_unit_ball(d)
    return -np.mean(digamma(nx + 1)) + digamma(n) + np.log(c_d) \
        + d * np.mean(np.log(epsilon))
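Reading the return expression off the code above, the quantity being computed is

.. math::
    \hat{H}(x) = -\frac{1}{n}\sum_{i=1}^{n} \psi(n_{x_i} + 1)
                 + \psi(n) + \log c_d
                 + \frac{d}{n}\sum_{i=1}^{n} \log \epsilon_i

where :math:`\psi` is the digamma function, :math:`n_{x_i}` is the number of points within distance :math:`\epsilon_i / 2` of sample :math:`i`, and :math:`c_d` is the volume of the d-dimensional unit ball.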
Example #20
def _compute_epsilon(x):
    """Calculate epsilon from KSG Estimator

    Represents twice the distance of each element to its k-th nearest neighbor.

    Args:
        x (array-like): An array with shape (n_samples, n_features)

    Returns:
        array-like: An array with shape (n_samples, 1) representing
            epsilon as described above.

    References:

    .. [1] A. Kraskov, H. Stogbauer and P. Grassberger, "Estimating mutual
           information". Phys. Rev. E 69, 2004.

    """
    k = N_NEIGHBORS
    n = x.shape[0]

    disc_mask = _get_disc_columns(x)
    if np.all(disc_mask):
        # if no continuous columns, there's no point getting epsilon
        return -np.inf
    c = x[:, ~disc_mask]

    nn = _make_neighbors(n_neighbors=k)
    nn.fit(c)
    distances = np.zeros(n)

    # if the k-th neighbor is at distance 0, then we are in trouble,
    # but we can work around it by increasing k, as long as the old
    # value of k is not needed again later
    while not np.all(distances) and k < n:
        distances, _ = nn.kneighbors(n_neighbors=k)
        distances = distances[:, -1]  # distances to k-nearest neighbor
        k += 1

    return asarray2d(2. * distances)
Example #21
 def get_feature_values(feature):
     return asarray2d(
         feature.as_feature_engineering_pipeline().fit_transform(
             X_df, y_df))
Example #22
def discover(
    features: List['ballet.feature.Feature'],
    X_df: Optional[pd.DataFrame],
    y_df: Optional[pd.DataFrame],
    y: Optional[np.ndarray],
    input: Optional[str] = None,
    primitive: Optional[str] = None,
    expensive_stats: bool = False,
) -> pd.DataFrame:
    """Discover existing features

    Display information about existing features including summary statistics on
    the development dataset.  If the feature extracts multiple feature values,
    then the summary statistics (e.g. mean, std, nunique) are computed for each
    feature value and then averaged. If the development dataset cannot be
    loaded, computation of summary statistics is skipped.

    The following information is shown:
    - name: the name of the feature
    - description: the description of the feature
    - input: the variables that are used as input to the feature
    - transformer: the transformer/transformer pipeline
    - output: the output columns of the feature (not usually specified)
    - author: the GitHub username of the feature's author
    - source: the fully-qualified name of the Python module that contains the
        feature
    - mutual_information: estimated mutual information between the feature (or
        averaged over feature values) and the target on the development
        dataset split
    - conditional_mutual_information: estimated conditional mutual information
        between the feature (or averaged over feature values) and the target
        conditional on all other features on the development dataset split
    - ninputs: the number of input columns to the feature
    - nvalues: the number of feature values this feature extracts (i.e. 1 for
        a scalar-valued feature and >1 for a vector-valued feature)
    - ncontinuous: the number of feature values this feature extracts that are
        continuous-valued
    - ndiscrete: the number of feature values this feature extracts that are
        discrete-valued
    - mean: mean of the feature on the development dataset split
    - std: standard deviation of the feature (or averaged over feature values)
        on the development dataset split
    - var: variance of the feature (or averaged over feature values) on the
        development dataset split
    - min: minimum of the feature on the development dataset split
    - median: median of the feature (or median over feature values) on the
        development dataset split
    - max: maximum of the feature on the development dataset split
    - nunique: number of unique values of the feature (or averaged over
        feature values) on the development dataset split

    The following query operators are supported:
    - input (str): filter to only features that have ``input`` in their input/
        list of inputs
    - primitive (str): filter to only features that use primitive
        ``primitive`` (i.e. a class with name ``primitive``) in the
        transformer/transformer pipeline

    For other queries, you should just use normal DataFrame indexing::

       >>> features_df[features_df['author'] == 'jane']
       >>> features_df[features_df['name'].str.contains('married')]
       >>> features_df[features_df['mutual_information'] > 0.05]
       >>> features_df[features_df['input'].apply(
               lambda input: 'A' in input and 'B' in input)]

    Returns:
        data frame with features on the row index and columns as described
        above
    """
    records = []

    if X_df is not None and y_df is not None and y is not None:

        @fy.ignore(Exception)
        def get_feature_values(feature):
            return asarray2d(
                feature.as_feature_engineering_pipeline().fit_transform(
                    X_df, y_df))

        values = {feature: get_feature_values(feature) for feature in features}
        y = asarray2d(y)
        summarize = fy.rpartial(_summarize_feature, values, y, expensive_stats)

    else:
        summarize = fy.rpartial(_summarize_feature, None, None,
                                expensive_stats)

    for feature in tqdm(features):
        if (input and isinstance(feature.input, Container)  # avoid callables
                and input not in feature.input and input != feature.input):
            continue
        if (primitive and primitive not in get_transformer_primitives(
                feature.transformer)):
            continue
        summary = summarize(feature)
        records.append(summary)

    return pd.DataFrame.from_records(records)
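A hypothetical call sketch using the query operators described in the docstring (the feature list, data frames, and filter values below are illustrative, not from a real project):

features_df = discover(
    features, X_df, y_df, y,
    input='age',                 # keep only features that read the 'age' column
    primitive='SimpleImputer',   # keep only features whose pipeline uses a SimpleImputer
)
print(features_df[['name', 'mutual_information']].head())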
Example #23
def H(a):  # noqa
    return estimate_entropy(asarray2d(a))
Example #24
def test_is_column_cont(x, expected):
    x = asarray2d(x)
    result = _is_column_cont(x)
    assert result == expected
Example #25
 def add_noise(X):
     X = asarray2d(X)
     return X + np.random.normal(0, 0.5, X.shape)
Example #26
 def test_is_column_disc(self):
     x = asarray2d(np.arange(50))
     result = _is_column_disc(x)
     self.assertTrue(result)
Example #27
 def test_is_column_cont(self):
     x = asarray2d(np.random.rand(50))
     result = _is_column_cont(x)
     self.assertTrue(result)
Example #28
def test_asarray2d_shape_n_x_1():
    # case: second dimension == 1
    a = np.zeros((3, 1))
    result = asarray2d(a)
    assert_array_equal(result, a)