def test_calculate_all_metafeatures_multilabel(multilabel_train_data):
    """All 52 metafeatures are computed for a multilabel dataset."""
    # Reset memoized helper results so this test is independent of others.
    meta_features.helper_functions.clear()
    X, y = multilabel_train_data
    # None of the ten generated features is categorical.
    feature_is_categorical = dict.fromkeys(range(10), False)
    result = meta_features.calculate_all_metafeatures(
        X,
        y,
        feature_is_categorical,
        "Generated",
        logger=logging.getLogger('TestMeta'),
    )
    assert len(result.metafeature_values) == 52
def test_calculate_all_metafeatures(self):
    """Every metafeature is computed and the categorical count is correct."""
    computed = meta_features.calculate_all_metafeatures(
        self.X, self.y, self.categorical, "2")
    # 52 metafeatures are defined in total.
    self.assertEqual(52, len(computed.metafeature_values))
    # The test dataset contains exactly 32 categorical columns.
    n_categorical = computed.metafeature_values['NumberOfCategoricalFeatures']
    self.assertEqual(n_categorical.value, 32)
    # Serialization to a text buffer must not raise.
    buffer = StringIO()
    computed.dump(buffer)
def test_calculate_all_metafeatures(self):
    """Check metafeature count, categorical-feature count, and dumping."""
    mf_set = meta_features.calculate_all_metafeatures(
        self.X, self.y, self.categorical, "2")
    # The full catalogue of metafeatures is 52 entries long.
    self.assertEqual(len(mf_set.metafeature_values), 52)
    self.assertEqual(
        mf_set.metafeature_values['NumberOfCategoricalFeatures'].value, 32)
    # Dumping the result must succeed without errors.
    out = StringIO()
    mf_set.dump(out)
def test_calculate_all_metafeatures_multilabel(self):
    """Multilabel targets still yield the full set of 52 metafeatures."""
    # Clear cached helper computations left over from earlier tests.
    self.helpers.clear()
    X, y = self.get_multilabel()
    # All ten synthetic features are numerical.
    categorical = [False for _ in range(10)]
    mf = meta_features.calculate_all_metafeatures(X, y, categorical,
                                                  "Generated")
    self.assertEqual(len(mf.metafeature_values), 52)
    # Serializing the metafeature set must not raise.
    sink = StringIO()
    mf.dump(sink)
def test_calculate_all_metafeatures_multilabel(self):
    """Compute metafeatures on multilabel data and serialize them."""
    # Start from a clean helper cache.
    self.helpers.clear()
    features, targets = self.get_multilabel()
    # Ten features, none of them categorical.
    no_categoricals = 10 * [False]
    result = meta_features.calculate_all_metafeatures(
        features, targets, no_categoricals, "Generated")
    self.assertEqual(52, len(result.metafeature_values))
    # Round-trip through dump() must work without errors.
    stream = StringIO()
    result.dump(stream)
def getFeatures(x, y, name):
    """Compute all dataset metafeatures, silencing runtime warnings.

    x: numpy array, X of the dataset
    y: 1-D numpy array, Y of the dataset
    name: string, name of the dataset

    Returns:
        DatasetMetafeatures object (found in
        autosklearn/metalearning/metafeatures/metafeature.py)
    """
    # Treat every column as numerical (non-categorical).
    all_numeric = [False for _ in range(x.shape[1])]
    # Metafeature computation may emit RuntimeWarnings (e.g. on
    # degenerate columns); suppress them for a clean run.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=RuntimeWarning)
        result = metafeatures.calculate_all_metafeatures(
            x, y, all_numeric, name)
    return result
def test_calculate_all_metafeatures(sparse_data):
    """All 52 metafeatures can be computed on sparse input."""
    X, y, categorical = sparse_data
    metafeature_set = meta_features.calculate_all_metafeatures(
        X,
        y,
        categorical,
        "2",
        logger=logging.getLogger('Meta'),
    )
    assert len(metafeature_set.metafeature_values) == 52
def test_calculate_all_metafeatures_same_results_across_datatypes():
    """
    This test makes sure that numpy and pandas produce the same metafeatures.
    This also is an excuse to fully test anneal dataset, and make sure
    all metafeatures work in this complex dataset
    """
    # First pass: pandas DataFrame input (OpenML data_id=2 is "anneal").
    X, y = fetch_openml(data_id=2, return_X_y=True, as_frame=True)
    # Mark columns as categorical based on their pandas dtype.
    categorical = {col: True if X[col].dtype.name == 'category' else False
                   for col in X.columns}
    mf = meta_features.calculate_all_metafeatures(
        X, y, categorical, "2", logger=logging.getLogger('Meta'))
    # The full catalogue of metafeatures is 52 entries long.
    assert 52 == len(mf.metafeature_values)
    # Hard-coded reference values for the anneal dataset.
    expected = {
        'PCASkewnessFirstPC': 0.41897660337677867,
        'PCAKurtosisFirstPC': -0.677692541156901,
        'PCAFractionOfComponentsFor95PercentVariance': 0.2716049382716049,
        'ClassEntropy': 1.1898338562043977,
        'SkewnessSTD': 7.540418815675546,
        'SkewnessMean': 1.47397188548894,
        'SkewnessMax': 29.916569235579203,
        'SkewnessMin': -29.916569235579203,
        'KurtosisSTD': 153.0563504598898,
        'KurtosisMean': 56.998860939761165,
        'KurtosisMax': 893.0011148272025,
        'KurtosisMin': -3.0,
        'SymbolsSum': 49,
        'SymbolsSTD': 1.3679553264445183,
        'SymbolsMean': 1.8846153846153846,
        'SymbolsMax': 7,
        'SymbolsMin': 1,
        'ClassProbabilitySTD': 0.28282850691819206,
        'ClassProbabilityMean': 0.2,
        'ClassProbabilityMax': 0.7616926503340757,
        'ClassProbabilityMin': 0.008908685968819599,
        'InverseDatasetRatio': 23.63157894736842,
        'DatasetRatio': 0.042316258351893093,
        'RatioNominalToNumerical': 5.333333333333333,
        'RatioNumericalToNominal': 0.1875,
        'NumberOfCategoricalFeatures': 32,
        'NumberOfNumericFeatures': 6,
        'NumberOfMissingValues': 22175.0,
        'NumberOfFeaturesWithMissingValues': 29.0,
        'NumberOfInstancesWithMissingValues': 898.0,
        'NumberOfFeatures': 38.0,
        'NumberOfClasses': 5.0,
        'NumberOfInstances': 898.0,
        'LogInverseDatasetRatio': 3.162583908575814,
        'LogDatasetRatio': -3.162583908575814,
        'PercentageOfMissingValues': 0.6498358926268901,
        'PercentageOfFeaturesWithMissingValues': 0.7631578947368421,
        'PercentageOfInstancesWithMissingValues': 1.0,
        'LogNumberOfFeatures': 3.6375861597263857,
        'LogNumberOfInstances': 6.8001700683022,
    }
    # Compare all non-landmarking metafeatures with float tolerance.
    assert {k: mf[k].value for k in expected.keys()} == pytest.approx(expected)
    # Landmarking metafeatures (classifier accuracies) get a looser
    # relative tolerance since they depend on model fitting.
    expected_landmarks = {
        'Landmark1NN': 0.9721601489757914,
        'LandmarkRandomNodeLearner': 0.7616945996275606,
        'LandmarkDecisionNodeLearner': 0.7827932960893855,
        'LandmarkDecisionTree': 0.9899875853507139,
        'LandmarkNaiveBayes': 0.9287150837988827,
        'LandmarkLDA': 0.9610242085661079,
    }
    assert {k: mf[k].value for k in expected_landmarks.keys()} == pytest.approx(
        expected_landmarks, rel=1e-5)
    # Then do numpy!
    # Second pass: same dataset as a plain numpy array; the categorical
    # mapping is re-keyed by positional index instead of column name.
    X, y = fetch_openml(data_id=2, return_X_y=True, as_frame=False)
    categorical = {i: True if category else False
                   for i, category in enumerate(categorical.values())}
    mf = meta_features.calculate_all_metafeatures(
        X, y, categorical, "2", logger=logging.getLogger('Meta'))
    # Non-landmarking metafeatures must be identical across datatypes.
    assert {k: mf[k].value for k in expected.keys()} == pytest.approx(expected)
    # The column-reorder of pandas and numpy array are different after
    # the data preprocessing. So we cannot directly compare, and landmarking is
    # sensible to column order
    expected_landmarks['LandmarkDecisionTree'] = 0.9922098075729361
    assert {k: mf[k].value for k in expected_landmarks.keys()} == pytest.approx(
        expected_landmarks, rel=1e-5)
def test_calculate_all_metafeatures(meta_train_data):
    """The full metafeature set is computed and the categorical count matches."""
    X, y, categorical = meta_train_data
    result = meta_features.calculate_all_metafeatures(
        X, y, categorical, "2", logger=logging.getLogger('Meta'))
    # 52 metafeatures are defined in total.
    assert len(result.metafeature_values) == 52
    # The training data contains exactly 32 categorical features.
    assert 32 == result.metafeature_values['NumberOfCategoricalFeatures'].value