import inspect

import numpy as np
from sklearn.ensemble import GradientBoostingClassifier


def representative_importance_matrix(repr_train,
                                     factor_train,
                                     repr_test,
                                     factor_test,
                                     random_state=1234,
                                     algo=GradientBoostingClassifier):
    r""" Use a tree-based classifier to estimate the importance of each
    representation dimension for predicting each factor.

    Arguments:
      repr_train, repr_test : a Matrix `(n_samples, n_features)`
        input features for training the classifier
      factor_train, factor_test : a Matrix `(n_samples, n_factors)`
        discrete labels for the classifier
      algo : `sklearn.Estimator`, a classifier with a `feature_importances_`
        attribute, for example:
        averaging methods:
        - `sklearn.ensemble.ExtraTreesClassifier`
        - `sklearn.ensemble.RandomForestClassifier`
        and boosting methods:
        - `sklearn.ensemble.GradientBoostingClassifier`
        - `sklearn.ensemble.AdaBoostClassifier`

    Return:
      importance_matrix : a Matrix of shape `(n_features, n_factors)`
      train accuracy : a list of `n_factors` scalars
      test accuracy : a list of `n_factors` scalars
    """
    num_latents = repr_train.shape[1]
    num_factors = factor_train.shape[1]
    assert hasattr(algo, 'feature_importances_'), \
        "The estimator class must expose a 'feature_importances_' attribute"

    def _train(factor_idx):
        # `n_iter_no_change` (early stopping) is only supported by gradient
        # boosting; pass it only when the estimator accepts it.
        kwargs = dict(random_state=random_state)
        if 'n_iter_no_change' in inspect.signature(algo).parameters:
            kwargs['n_iter_no_change'] = 100
        model = algo(**kwargs)
        model.fit(np.asarray(repr_train),
                  np.asarray(factor_train[:, factor_idx]))
        feat = np.abs(model.feature_importances_)
        train = np.mean(
            model.predict(repr_train) == factor_train[:, factor_idx])
        test = np.mean(model.predict(repr_test) == factor_test[:, factor_idx])
        return factor_idx, feat, train, test

    # ====== compute importance based on gradient boosted trees ====== #
    importance_matrix = np.zeros(shape=[num_latents, num_factors],
                                 dtype=np.float64)
    train_acc = [0.] * num_factors
    test_acc = [0.] * num_factors
    for factor_idx in range(num_factors):
        i, feat, train, test = _train(factor_idx)
        importance_matrix[:, i] = feat
        train_acc[i] = train
        test_acc[i] = test
    return importance_matrix, train_acc, test_acc
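# --------------------------------------------------------------------------
# Minimal usage sketch (not part of the original module): builds a synthetic
# dataset in which each discrete factor is determined by a single latent
# dimension, then checks the shapes of the returned importance matrix and
# accuracy lists. The helper name `_demo_importance_matrix`, the array sizes,
# and the sign-thresholding used to generate labels are illustrative
# assumptions, not values from the original code.
# --------------------------------------------------------------------------
def _demo_importance_matrix():
    import numpy as np
    rng = np.random.RandomState(0)
    n_train, n_test, n_latents, n_factors = 200, 50, 8, 3
    z_train = rng.randn(n_train, n_latents)
    z_test = rng.randn(n_test, n_latents)
    # factor j is the sign of latent dimension j (a binary label), so the
    # importance for factor j should concentrate on latent dimension j
    f_train = (z_train[:, :n_factors] > 0).astype(np.int64)
    f_test = (z_test[:, :n_factors] > 0).astype(np.int64)
    matrix, train_acc, test_acc = representative_importance_matrix(
        z_train, f_train, z_test, f_test)
    assert matrix.shape == (n_latents, n_factors)
    assert len(train_acc) == len(test_acc) == n_factors
    return matrix, train_acc, test_acc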
def mutual_info_estimate(representations,
                         factors,
                         continuous_representations=True,
                         continuous_factors=False,
                         n_neighbors=3,
                         random_state=1234):
    r""" Nonparametric estimation of mutual information based on entropy
    estimates from k-nearest-neighbor distances
    (note: this implementation uses multi-processing).

    Return:
      matrix `[num_latents, num_factors]`, estimated mutual information
        between each representation dimension and each factor

    References:
      A. Kraskov, H. Stögbauer and P. Grassberger, "Estimating mutual
        information". Phys. Rev. E 69, 2004.
      B. C. Ross, "Mutual Information between Discrete and Continuous
        Data Sets". PLoS ONE 9(2), 2014.
      L. F. Kozachenko, N. N. Leonenko, "Sample Estimate of the Entropy of a
        Random Vector". Probl. Peredachi Inf., 23:2 (1987), 9-16.
    """
    from sklearn.feature_selection import (mutual_info_classif,
                                           mutual_info_regression)
    mutual_info = mutual_info_regression if continuous_factors else \
        mutual_info_classif
    num_latents = representations.shape[1]
    num_factors = factors.shape[1]
    mi_matrix = np.empty(shape=(num_latents, num_factors), dtype=np.float64)

    # estimate the mutual information between all latent dimensions and a
    # single factor (one job per factor)
    def func(idx):
        mi = mutual_info(representations,
                         factors[:, idx],
                         discrete_features=not continuous_representations,
                         n_neighbors=n_neighbors,
                         random_state=random_state)
        return idx, mi

    # NOTE: `MPI` and `get_cpu_count` are multi-processing helpers assumed to
    # be provided by the surrounding project and imported at module level.
    for i, mi in MPI(jobs=list(range(num_factors)),
                     func=func,
                     ncpu=min(max(1, get_cpu_count() - 1), 10),
                     batch=1):
        mi_matrix[:, i] = mi
    return mi_matrix
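# --------------------------------------------------------------------------
# Minimal usage sketch (not part of the original module): estimates mutual
# information between random continuous representations and discrete factors
# derived from them. The helper name `_demo_mutual_info` and the data sizes
# are illustrative assumptions; the call relies on the project's `MPI` and
# `get_cpu_count` helpers being available, as required by
# `mutual_info_estimate` above.
# --------------------------------------------------------------------------
def _demo_mutual_info():
    import numpy as np
    rng = np.random.RandomState(0)
    n_samples, n_latents, n_factors = 500, 8, 3
    z = rng.randn(n_samples, n_latents)
    # factor j is a 3-way discretization of latent dimension j, so the largest
    # estimated MI for factor j is expected at latent dimension j
    f = np.digitize(z[:, :n_factors], bins=[-0.5, 0.5])
    mi = mutual_info_estimate(z, f,
                              continuous_representations=True,
                              continuous_factors=False)
    assert mi.shape == (n_latents, n_factors)
    return mi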