예제 #1
0
    def test_compute_effects_on_compute(self):
        """
        Tests whether computing metafeatures has any side effects on the
        instance metafeatures object. Fails if there are any side effects.

        For every dataset in `self.datasets`, metafeatures are computed twice
        on the same `Metafeatures` instance with identical arguments. Only the
        result of the second run is compared against the dataset's known
        metafeatures, so state left behind by the first run that alters the
        second run's output shows up as a correctness failure.
        """
        test_failures = {}
        test_name = inspect.stack()[0][3]
        for dataset_filename, dataset in self.datasets.items():
            metafeatures_instance = Metafeatures()
            compute_kwargs = {
                "X": dataset["X"],
                "Y": dataset["Y"],
                "seed": CORRECTNESS_SEED,
                "column_types": dataset["column_types"],
            }
            # first run: the result is discarded on purpose; it only exists to
            # leave behind whatever state a compute() call leaves on the
            # instance.
            metafeatures_instance.compute(**compute_kwargs)
            # second run: same instance, same arguments.
            computed_mfs = metafeatures_instance.compute(**compute_kwargs)

            known_mfs = dataset["known_metafeatures"]
            # Build the check list per dataset. Accumulating it across
            # iterations would hand every earlier dataset's check to
            # _perform_checks again on each pass.
            # NOTE(review): assumes _perform_checks simply runs each
            # (function, args) pair and returns a dict of failures -- confirm.
            required_checks = [
                (self._check_correctness,
                 [computed_mfs, known_mfs, dataset_filename])
            ]
            test_failures.update(self._perform_checks(required_checks))
        self._report_test_failures(test_failures, test_name)
예제 #2
0
    def test_n_folds_with_small_dataset(self):
        """Expect a ValueError when n_folds=2 is used on a 3-instance dataset.

        The target has three instances, only one of which belongs to class
        "1". The test asserts both the exception type and its exact message.
        Per the original author's note, the error is only expected when
        landmarking metafeatures are being computed.
        """
        # Targets are cast to strings; class "1" occurs exactly once.
        tiny_targets = pd.Series([0, 1, 0], name="target").astype("str")
        tiny_features = pd.DataFrame(np.random.rand(3, 7))
        expected_message = (
            "The minimum number of instances in each class of Y is n_folds=2."
            + " Class 1 has 1.")
        metafeatures = Metafeatures()

        with self.assertRaises(ValueError) as raised:
            metafeatures.compute(tiny_features, tiny_targets, n_folds=2)

        self.assertEqual(str(raised.exception), expected_message)
예제 #3
0
    def test_numeric_targets(self):
        """ Test Metafeatures().compute() with numeric targets

        For every dataset in `self.datasets`, the target column is replaced by
        random floats and declared as `consts.NUMERIC` in the column types.
        The expected values are the dataset's known metafeatures, except that
        every target-dependent metafeature is expected to carry
        `consts.NUMERIC_TARGETS` as its value with a compute time of 0.
        """
        test_failures = {}
        test_name = inspect.stack()[0][3]
        # The list of target-dependent metafeatures does not depend on the
        # dataset, so it is looked up once instead of once per iteration.
        target_dependent_metafeatures = Metafeatures.list_metafeatures(
            consts.MetafeatureGroup.TARGET_DEPENDENT.value)
        for dataset_filename, dataset in self.datasets.items():
            metafeatures = Metafeatures()
            # Copy so that retyping the target does not leak into the fixture.
            column_types = dataset["column_types"].copy()
            column_types[dataset["Y"].name] = consts.NUMERIC
            computed_mfs = metafeatures.compute(
                X=dataset["X"],
                Y=pd.Series(np.random.rand(dataset["Y"].shape[0]),
                            name=dataset["Y"].name),
                seed=CORRECTNESS_SEED,
                column_types=column_types)
            # Shallow copy for the same reason as column_types: the entries
            # overridden below must not overwrite the dataset's stored known
            # metafeatures. Only top-level keys are replaced, so a shallow
            # copy is sufficient.
            known_mfs = dict(dataset["known_metafeatures"])
            for mf_name in target_dependent_metafeatures:
                known_mfs[mf_name] = {
                    consts.VALUE_KEY: consts.NUMERIC_TARGETS,
                    consts.COMPUTE_TIME_KEY: 0.
                }

            required_checks = [(self._check_correctness,
                                [computed_mfs, known_mfs, dataset_filename]),
                               (self._check_compare_metafeature_lists,
                                [computed_mfs, known_mfs, dataset_filename])]
            test_failures.update(self._perform_checks(required_checks))

        self._report_test_failures(test_failures, test_name)
예제 #4
0
    def test_soft_timeout(self):
        """Tests Metafeatures().compute() with timeout set

        For every dataset in `self.datasets`:

        1. Time an unrestricted compute() call.
        2. Repeat the call with `timeout` set to half of that time and assert
           that the limited run took less wall-clock time than the full run.
        3. Check the metafeatures that did not time out for correctness, and
           compare the full computed set against the known metafeatures.
        """
        test_name = inspect.stack()[0][3]
        test_failures = {}
        for dataset_filename, dataset in self.datasets.items():
            metafeatures = Metafeatures()

            # Baseline: how long does an unrestricted computation take?
            start_time = time.time()
            metafeatures.compute(X=dataset["X"],
                                 Y=dataset["Y"],
                                 seed=CORRECTNESS_SEED,
                                 column_types=dataset["column_types"])
            full_compute_time = time.time() - start_time

            # Same computation on the same instance, limited to half the
            # baseline time.
            start_time = time.time()
            computed_mfs = metafeatures.compute(
                X=dataset["X"],
                Y=dataset["Y"],
                seed=CORRECTNESS_SEED,
                column_types=dataset["column_types"],
                timeout=full_compute_time / 2)
            limited_compute_time = time.time() - start_time

            self.assertGreater(
                full_compute_time, limited_compute_time,
                f"Compute metafeatures exceeded timeout on '{dataset_filename}'"
            )
            # Keep only the metafeatures that finished: entries whose value is
            # the TIMEOUT marker have nothing to compare against.
            completed_mfs = {
                k: v
                for k, v in computed_mfs.items()
                if v[consts.VALUE_KEY] != consts.TIMEOUT
            }
            known_mfs = dataset["known_metafeatures"]
            required_checks = [
                (self._check_correctness,
                 [completed_mfs, known_mfs, dataset_filename]),
                (self._check_compare_metafeature_lists,
                 [computed_mfs, known_mfs, dataset_filename])
            ]
            # Must run inside the loop: outside it, only the last dataset's
            # checks would be performed, and an empty `self.datasets` would
            # raise NameError on `required_checks`.
            test_failures.update(self._perform_checks(required_checks))

        self._report_test_failures(test_failures, test_name)
예제 #5
0
 def test_sampling_shape_correctness(self):
     """Check that the sampled feature data has the requested shape.

     Runs compute() on the dummy features/target with an explicit
     `sample_shape`, then reads the sample back from the instance's private
     `_resources` mapping (entry "XSample", key "value") and asserts that
     its shape equals the requested one.
     """
     sample_shape = (7, 13)
     metafeatures = Metafeatures()
     # The return value is not needed; the call is made so that the instance
     # holds the "XSample" resource inspected below.
     metafeatures.compute(
         self.dummy_features,
         self.dummy_target,
         sample_shape=sample_shape,
     )
     sample_resource = metafeatures._resources["XSample"]
     X_sample = sample_resource["value"]
     mismatch_message = (
         f"Sampling produced incorrect shape {X_sample.shape}; should have"
         + f" been {sample_shape}.")
     self.assertEqual(X_sample.shape, sample_shape, mismatch_message)
예제 #6
0
def run_metafeature_benchmark(benchmark_name, iters=100):
    """
    Benchmark `Metafeatures` construction and computation per test dataset.

    The dataset descriptions are loaded from the JSON file at METADATA_PATH.
    For each described dataset, a fresh `Metafeatures` object is constructed
    and `compute` is called `iters` times. The mean and standard deviation
    are recorded for the construction time, the total compute time, and the
    compute time reported for each individual metafeature.

    The collected statistics, keyed by dataset filename, are handed to
    `write_benchmark_data` together with `benchmark_name` (per the original
    description, they end up in ./<benchmark_name>.json).
    """

    def summarize(samples):
        # Mean / standard deviation summary of one series of timings.
        return {"mean": np.mean(samples), "std_dev": np.std(samples)}

    with open(METADATA_PATH, "r") as f:
        dataset_descriptions = json.load(f)

    benchmark_data = {}
    for dataset_metadata in dataset_descriptions:
        print(dataset_metadata["filename"])
        X, Y, column_types = read_dataset(dataset_metadata)

        init_times = []
        total_compute_times = []
        # One list of timings per known metafeature id.
        metafeature_compute_times = {mf_id: [] for mf_id in Metafeatures.IDS}

        for i in range(iters):
            print(f"iter {i}")

            t_start = time.time()
            mf = Metafeatures()
            t_initialized = time.time()
            computed_mfs = mf.compute(
                X=X,
                Y=Y,
                column_types=column_types,
                seed=CORRECTNESS_SEED,
            )
            t_computed = time.time()

            init_times.append(t_initialized - t_start)
            total_compute_times.append(t_computed - t_initialized)
            for mf_id, result in computed_mfs.items():
                per_mf_time = result[consts.COMPUTE_TIME_KEY]
                metafeature_compute_times[mf_id].append(per_mf_time)

        per_metafeature_stats = {}
        for mf_id, mf_times in metafeature_compute_times.items():
            per_metafeature_stats[mf_id] = summarize(mf_times)

        benchmark_data[dataset_metadata["filename"]] = {
            "init_time": summarize(init_times),
            "total_compute_time": summarize(total_compute_times),
            "metafeature_compute_time": per_metafeature_stats,
        }

    write_benchmark_data(benchmark_name, benchmark_data)
def get_list_metafeatures(list_X, list_y, type_metafeatures):
    """
    Compute one group of metafeatures for a collection of datasets.

    Args:
        list_X: iterable of feature tables; each item is wrapped in a
            `pd.DataFrame` before being passed to `Metafeatures.compute`.
        list_y: iterable of targets, paired element-wise with `list_X`; each
            item is converted to a categorical `pd.Series`.
        type_metafeatures: metafeature group, forwarded as `group=` to
            `Metafeatures.list_metafeatures` to select which ids to compute.

    Returns:
        A `pd.DataFrame` made by concatenating the per-dataset results, with
        missing values filled with 0 and indexed by `list_files`.

    NOTE(review): `list_files` is not a parameter and is not defined in this
    function, so it must exist at module level and have one entry per row of
    the concatenated frame -- confirm against the caller.
    """
    metafeatures = Metafeatures()
    # The requested ids depend only on the group, not on the dataset.
    metafeature_ids = metafeatures.list_metafeatures(group=type_metafeatures)

    # Size the progress bar from the actual input instead of a hard-coded
    # dataset count. zip() stops at the shorter input, hence min(). Inputs
    # without a length (e.g. generators) get an open-ended bar.
    try:
        total = min(len(list_X), len(list_y))
    except TypeError:
        total = None

    list_dataset_metafeatures = []
    # Iterate over the `list_y` parameter; the name `list_Y` is not defined
    # in this function.
    for X, y in tqdm(zip(list_X, list_y), total=total):
        mfs = metafeatures.compute(
            pd.DataFrame(X),
            Y=pd.Series(y, dtype="category"),
            metafeature_ids=metafeature_ids,
            exclude=None,
            seed=0,
            timeout=60,
        )
        list_dataset_metafeatures.append(
            pd.DataFrame(mfs).reset_index(drop=True))

    df_metafeatures = pd.concat(list_dataset_metafeatures).fillna(0)
    df_metafeatures["index"] = list_files
    df_metafeatures.set_index("index", inplace=True)
    return df_metafeatures
예제 #8
0
import pandas as pd
import numpy as np
from metalearn import Metafeatures

# Load the KDD Cup 99 dataset, print a short summary of it, compute its
# metafeatures, and save them to a text file.
base = pd.read_csv('kddcup99.csv')

# Summary of the loaded table.
print("Informações da Base de Dados")
print("Quantidade de linhas e colunas: ", base.shape)
print("Descrição do Index: ", base.index)
print("Colunas presentes: ", base.columns)
# count must be called: printing the bound method shows the method's repr,
# not the per-column counts. The label is also corrected; it previously
# repeated the "Colunas presentes" label of the line above.
print("Contagem de valores não nulos por coluna: ", base.count())

# The 'label' column is the target; every other column is a feature.
X = base.drop('label', axis=1)
Y = base['label']

metafeatures = Metafeatures()
mfs = metafeatures.compute(X, Y)

print(mfs)

# Persist the same text that was printed above. The context manager closes
# the file even if writing fails.
with open('metafeatures_output.txt', 'w',
          encoding='utf-8') as metafeatures_output:
    print(mfs, file=metafeatures_output)