Example #1

import inspect

import numpy as np
from sklearn.ensemble import GradientBoostingClassifier

def representative_importance_matrix(repr_train,
                                     factor_train,
                                     repr_test,
                                     factor_test,
                                     random_state=1234,
                                     algo=GradientBoostingClassifier):
    r""" Using Tree Classifier to estimate the importance of each
  representation for each factor.

  Arguments:
    repr_train, repr_test : a Matrix `(n_samples, n_features)`
      input features for training the classifier
    factor_train, factor_test : a Matrix `(n_samples, n_factors)`
      discrete labels for the classifier
    algo : `sklearn.Estimator`, a classifier with a `feature_importances_`
      attribute, for example:
        averaging methods:
        - `sklearn.ensemble.ExtraTreesClassifier`
        - `sklearn.ensemble.RandomForestClassifier`
        and boosting methods:
        - `sklearn.ensemble.GradientBoostingClassifier`
        - `sklearn.ensemble.AdaBoostClassifier`

  Return:
    importance_matrix : a Matrix of shape `(n_features, n_factors)`
    train accuracy : a list of `n_factors` Scalars
    test accuracy : a list of `n_factors` Scalars
  """
    num_latents = repr_train.shape[1]
    num_factors = factor_train.shape[1]
    assert hasattr(algo, 'feature_importances_'), \
      "The classifier class must expose a 'feature_importances_' attribute"

    def _train(factor_idx):
        # `n_iter_no_change` (early stopping) is only accepted by some
        # estimators (e.g. GradientBoostingClassifier), so pass it only
        # when the constructor signature supports it.
        kwargs = dict(random_state=random_state)
        if 'n_iter_no_change' in inspect.signature(algo).parameters:
            kwargs['n_iter_no_change'] = 100
        model = algo(**kwargs)
        model.fit(np.asarray(repr_train),
                  np.asarray(factor_train[:, factor_idx]))
        feat = np.abs(model.feature_importances_)
        train = np.mean(
            model.predict(repr_train) == factor_train[:, factor_idx])
        test = np.mean(model.predict(repr_test) == factor_test[:, factor_idx])
        return factor_idx, feat, train, test

    # ====== compute importance based on tree ensembles ====== #
    importance_matrix = np.zeros(shape=[num_latents, num_factors],
                                 dtype=np.float64)
    train_acc = [0.0] * num_factors
    test_acc = [0.0] * num_factors
    for factor_idx in range(num_factors):
        i, feat, train, test = _train(factor_idx)
        importance_matrix[:, i] = feat
        train_acc[i] = train
        test_acc[i] = test
    return importance_matrix, train_acc, test_acc
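
A minimal usage sketch for the function above (assumptions: synthetic random
data with discrete integer factor labels; the sizes and data are illustrative
only):

import numpy as np

rng = np.random.RandomState(0)
# 1000 train / 200 test samples, 10 latent dimensions, 3 factors of 4 classes
repr_train = rng.randn(1000, 10)
repr_test = rng.randn(200, 10)
factor_train = rng.randint(0, 4, size=(1000, 3))
factor_test = rng.randint(0, 4, size=(200, 3))

imp, train_acc, test_acc = representative_importance_matrix(
    repr_train, factor_train, repr_test, factor_test)
print(imp.shape)  # (10, 3): one column of feature importances per factor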
Example #2

import numpy as np

# NOTE: `MPI` (a multi-processing map over jobs) and `get_cpu_count` are
# assumed to be utilities provided by the host library.
def mutual_info_estimate(representations,
                         factors,
                         continuous_representations=True,
                         continuous_factors=False,
                         n_neighbors=3,
                         random_state=1234):
    r""" Nonparametric method for estimating entropy from k-nearest neighbors
  distances (note: this implementation use multi-processing)

  Return:
    matrix `[num_latents, num_factors]`, the estimated mutual information
      between each representation dimension and each factor

  References:
    A. Kraskov, H. Stogbauer and P. Grassberger, “Estimating mutual information”.
      Phys. Rev. E 69, 2004.
    B. C. Ross “Mutual Information between Discrete and Continuous Data Sets”.
      PLoS ONE 9(2), 2014.
    L. F. Kozachenko, N. N. Leonenko, “Sample Estimate of the Entropy of a
      Random Vector”. Probl. Peredachi Inf., 23:2 (1987), 9-16.
  """
    from sklearn.feature_selection import (mutual_info_classif,
                                           mutual_info_regression)
    mutual_info = mutual_info_regression if continuous_factors else \
      mutual_info_classif
    num_latents = representations.shape[1]
    num_factors = factors.shape[1]
    mi_matrix = np.empty(shape=(num_latents, num_factors), dtype=np.float64)

    # estimate the MI of all latents against a single factor
    def func(idx):
        mi = mutual_info(representations,
                         factors[:, idx],
                         discrete_features=not continuous_representations,
                         n_neighbors=n_neighbors,
                         random_state=random_state)
        return idx, mi

    # run one factor per job in parallel, capped at 10 worker processes
    ncpu = min(max(1, get_cpu_count() - 1), 10)
    for i, mi in MPI(jobs=list(range(num_factors)),
                     func=func,
                     ncpu=ncpu,
                     batch=1):
        mi_matrix[:, i] = mi
    return mi_matrix
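
A minimal usage sketch for the function above (assumptions: synthetic
continuous representations and discrete factors; the host-library helpers
used inside the function must be importable):

import numpy as np

rng = np.random.RandomState(0)
representations = rng.randn(1000, 10)        # continuous latents
factors = rng.randint(0, 4, size=(1000, 3))  # discrete factors

mi = mutual_info_estimate(representations, factors)
print(mi.shape)  # (10, 3): MI of each latent dimension with each factor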