Example #1
    def test_exclude_metafeature_groups(self):
        SUBSET_LENGTH = 3
        test_failures = {}
        test_name = inspect.stack()[0][3]
        for dataset_filename, dataset in self.datasets.items():
            groups = random.sample(
                [group.value for group in consts.MetafeatureGroup],
                SUBSET_LENGTH)
            computed_mfs = Metafeatures().compute(
                X=dataset["X"],
                Y=dataset["Y"],
                column_types=dataset["column_types"],
                seed=CORRECTNESS_SEED,
                exclude_groups=groups,
            )
            known_metafeatures = dataset["known_metafeatures"]
            required_checks = [
                (self._check_correctness,
                 [computed_mfs, known_metafeatures, dataset_filename])
            ]
            test_failures.update(self._perform_checks(required_checks))

            metafeature_ids = set(
                mf_id for group in groups
                for mf_id in Metafeatures.list_metafeatures(group))
            if any(mf_id in computed_mfs.keys() for mf_id in metafeature_ids):
                self.fail('Metafeatures computed an excluded metafeature')
        self._report_test_failures(test_failures, test_name)
Example #2
    def test_numeric_targets(self):
        """ Test Metafeatures().compute() with numeric targets
        """
        test_failures = {}
        test_name = inspect.stack()[0][3]
        for dataset_filename, dataset in self.datasets.items():
            metafeatures = Metafeatures()
            column_types = dataset["column_types"].copy()
            column_types[dataset["Y"].name] = consts.NUMERIC
            computed_mfs = metafeatures.compute(
                X=dataset["X"],
                Y=pd.Series(np.random.rand(dataset["Y"].shape[0]),
                            name=dataset["Y"].name),
                seed=CORRECTNESS_SEED,
                column_types=column_types)
            # copy: the target-dependent entries are overwritten below
            known_mfs = dataset["known_metafeatures"].copy()
            target_dependent_metafeatures = Metafeatures.list_metafeatures(
                consts.MetafeatureGroup.TARGET_DEPENDENT.value)
            for mf_name in target_dependent_metafeatures:
                known_mfs[mf_name] = {
                    consts.VALUE_KEY: consts.NUMERIC_TARGETS,
                    consts.COMPUTE_TIME_KEY: 0.
                }

            required_checks = [(self._check_correctness,
                                [computed_mfs, known_mfs, dataset_filename]),
                               (self._check_compare_metafeature_lists,
                                [computed_mfs, known_mfs, dataset_filename])]
            test_failures.update(self._perform_checks(required_checks))

        self._report_test_failures(test_failures, test_name)
Example #3
 def test_n_folds_invalid_input(self):
     tests = [{
         "n_folds": 0,
         "message": "`n_folds` must be >= 2, but was 0"
     }, {
         "n_folds": 1,
         "message": "`n_folds` must be >= 2, but was 1"
     }, {
         "n_folds": 2.1,
         "message": "`n_folds` must be an integer, not 2.1"
     }, {
         "n_folds": "hello",
         "message": "`n_folds` must be an integer, not hello"
     }, {
         "n_folds": [3],
         "message": "`n_folds` must be an integer, not [3]"
     }, {
         "n_folds": {
             5: 7
         },
         "message": "`n_folds` must be an integer, not {5: 7}"
     }]
     for test in tests:
         with self.assertRaises(ValueError) as cm:
             Metafeatures().compute(self.dummy_features,
                                    self.dummy_target,
                                    n_folds=test["n_folds"])
         self.assertEqual(str(cm.exception), test["message"])
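
A minimal, hypothetical sketch of the validation these messages imply (the actual metalearn implementation may differ):

def _validate_n_folds(n_folds):
    # bool is a subclass of int, so reject it explicitly
    if not isinstance(n_folds, int) or isinstance(n_folds, bool):
        raise ValueError(f"`n_folds` must be an integer, not {n_folds}")
    if n_folds < 2:
        raise ValueError(f"`n_folds` must be >= 2, but was {n_folds}")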
Example #4
 def test_y_no_name(self):
     X = pd.DataFrame(np.random.rand(8, 2))
     y = pd.Series(['a', 'a', 'a', 'a', 'b', 'b', 'b', 'b'])
     try:
         Metafeatures().compute(X, y)
     except Exception as e:
         self.fail(e)
Example #5
    def test_request_and_exclude_metafeature_groups(self):
        with self.assertRaises(ValueError):
            Metafeatures().compute(X=self.dummy_features,
                                   Y=self.dummy_target,
                                   groups=[],
                                   exclude_groups=[])

        with self.assertRaises(ValueError):
            Metafeatures().compute(X=self.dummy_features,
                                   Y=self.dummy_target,
                                   groups=['foobar'])

        with self.assertRaises(ValueError):
            Metafeatures().compute(X=self.dummy_features,
                                   Y=self.dummy_target,
                                   exclude_groups=['foobar'])
Example #6
    def test_compute_effects_on_compute(self):
        """
        Tests whether computing metafeatures has any side effects on the
        instance metafeatures object. Fails if there are any side effects.
        """
        test_failures = {}
        test_name = inspect.stack()[0][3]
        for dataset_filename, dataset in self.datasets.items():
            metafeatures_instance = Metafeatures()
            # first run
            metafeatures_instance.compute(X=dataset["X"],
                                          Y=dataset["Y"],
                                          seed=CORRECTNESS_SEED,
                                          column_types=dataset["column_types"])
            # second run
            computed_mfs = metafeatures_instance.compute(
                X=dataset["X"],
                Y=dataset["Y"],
                seed=CORRECTNESS_SEED,
                column_types=dataset["column_types"])

            known_mfs = dataset["known_metafeatures"]
            # rebuild the check list per dataset so earlier checks don't rerun
            required_checks = [
                (self._check_correctness,
                 [computed_mfs, known_mfs, dataset_filename])
            ]
            test_failures.update(self._perform_checks(required_checks))
        self._report_test_failures(test_failures, test_name)
Example #7
 def test_sampling_shape_no_exception(self):
     try:
         Metafeatures().compute(self.dummy_features,
                                self.dummy_target,
                                sample_shape=(10, 10))
     except Exception as e:
         exc_type = type(e).__name__
         self.fail(f"computing metafeatures raised {exc_type} unexpectedly")
Example #8
 def test_run_without_exception(self):
     try:
         for dataset_filename, dataset in self.datasets.items():
             Metafeatures().compute(X=dataset["X"],
                                    Y=dataset["Y"],
                                    column_types=dataset["column_types"])
     except Exception as e:
         exc_type = type(e).__name__
         self.fail(f"computing metafeatures raised {exc_type} unexpectedly")
Example #9
 def test_column_type_input(self):
     column_types = {
         col: consts.NUMERIC
         for col in self.dummy_features.columns
     }
     column_types[self.dummy_features.columns[2]] = consts.CATEGORICAL
     column_types[self.dummy_target.name] = consts.CATEGORICAL
     # all valid
     try:
         Metafeatures().compute(self.dummy_features, self.dummy_target,
                                column_types)
     except Exception as e:
         exc_type = type(e).__name__
         self.fail(f"computing metafeatures raised {exc_type} unexpectedly")
     # some valid
     column_types[self.dummy_features.columns[0]] = "NUMBER"
     column_types[self.dummy_features.columns[1]] = "CATEGORY"
     with self.assertRaises(ValueError) as cm:
         Metafeatures().compute(self.dummy_features, self.dummy_target,
                                column_types)
     self.assertTrue(
         str(cm.exception).startswith("Invalid column types:"),
         "Some invalid column types test failed")
     # all invalid
     column_types = {
         feature: "INVALID_TYPE"
         for feature in self.dummy_features.columns
     }
     column_types[self.dummy_target.name] = "INVALID"
     with self.assertRaises(ValueError) as cm:
         Metafeatures().compute(self.dummy_features, self.dummy_target,
                                column_types)
     self.assertTrue(
         str(cm.exception).startswith("Invalid column types:"),
         "All invalid column types test failed")
     # invalid number of column types
     del column_types[self.dummy_features.columns[0]]
     with self.assertRaises(ValueError) as cm:
         Metafeatures().compute(self.dummy_features, self.dummy_target,
                                column_types)
     self.assertTrue(
         str(cm.exception).startswith(
             "Column type not specified for column"),
         "Invalid number of column types test failed")
Example #10
    def test_metafeatures_input_partial_invalid(self):
        """ Test case where only some requested and excluded metafeatures are invalid. """

        invalid_metafeatures = ["ThisIsNotValid", "ThisIsAlsoNotValid"]
        valid_metafeatures = ["NumberOfInstances", "NumberOfFeatures"]

        with self.assertRaises(ValueError) as cm:
            Metafeatures().compute(X=self.dummy_features,
                                   Y=self.dummy_target,
                                   metafeature_ids=invalid_metafeatures +
                                   valid_metafeatures)
        self._check_invalid_metafeature_exception_string(
            str(cm.exception),
            self.invalid_requested_metafeature_message_start,
            invalid_metafeatures)

        with self.assertRaises(ValueError) as cm:
            Metafeatures().compute(X=self.dummy_features,
                                   Y=self.dummy_target,
                                   exclude=invalid_metafeatures +
                                   valid_metafeatures)
        self._check_invalid_metafeature_exception_string(
            str(cm.exception), self.invalid_excluded_metafeature_message_start,
            invalid_metafeatures)

        # Order should not matter
        with self.assertRaises(ValueError) as cm:
            Metafeatures().compute(X=self.dummy_features,
                                   Y=self.dummy_target,
                                   metafeature_ids=valid_metafeatures +
                                   invalid_metafeatures)
        self._check_invalid_metafeature_exception_string(
            str(cm.exception),
            self.invalid_requested_metafeature_message_start,
            invalid_metafeatures)

        with self.assertRaises(ValueError) as cm:
            Metafeatures().compute(X=self.dummy_features,
                                   Y=self.dummy_target,
                                   exclude=valid_metafeatures +
                                   invalid_metafeatures)
        self._check_invalid_metafeature_exception_string(
            str(cm.exception), self.invalid_excluded_metafeature_message_start,
            invalid_metafeatures)
Example #11
    def test_request_and_exclude_metafeatures(self):
        expected_exception_string = "metafeature_ids and exclude cannot both be non-null"

        with self.assertRaises(ValueError) as cm:
            Metafeatures().compute(X=self.dummy_features,
                                   Y=self.dummy_target,
                                   metafeature_ids=[],
                                   exclude=[])

        self.assertEqual(str(cm.exception), expected_exception_string)
Example #12
 def test_target_column_with_one_unique_value(self):
     # should not raise an error
     X = pd.DataFrame(np.random.rand(100, 7))
     Y = pd.Series(np.random.randint(0, 1, 100),
                   name="target").astype("str")
     try:
         Metafeatures().compute(X, Y)
     except Exception as e:
         exc_type = type(e).__name__
         self.fail(f"computing metafeatures raised {exc_type} unexpectedly")
Example #13
 def test_sampling_shape_correctness(self):
     sample_shape = (7, 13)
     metafeatures = Metafeatures()
     metafeatures.compute(self.dummy_features,
                          self.dummy_target,
                          sample_shape=sample_shape)
     X_sample = metafeatures._resources["XSample"]["value"]
     self.assertEqual(
         X_sample.shape, sample_shape,
         f"Sampling produced incorrect shape {X_sample.shape}; should have"
         + f" been {sample_shape}.")
Example #14
    def test_metafeatures_input_all_invalid(self):
        """ Test cases where all requested and excluded metafeatures are invalid. """

        invalid_metafeatures = ["ThisIsNotValid", "ThisIsAlsoNotValid"]

        with self.assertRaises(ValueError) as cm:
            Metafeatures().compute(X=self.dummy_features,
                                   Y=self.dummy_target,
                                   metafeature_ids=invalid_metafeatures)
        self._check_invalid_metafeature_exception_string(
            str(cm.exception),
            self.invalid_requested_metafeature_message_start,
            invalid_metafeatures)

        with self.assertRaises(ValueError) as cm:
            Metafeatures().compute(X=self.dummy_features,
                                   Y=self.dummy_target,
                                   exclude=invalid_metafeatures)
        self._check_invalid_metafeature_exception_string(
            str(cm.exception), self.invalid_excluded_metafeature_message_start,
            invalid_metafeatures)
Example #15
    def test_n_folds_with_small_dataset(self):
        # should raise error with small (few instances) dataset
        # unless not computing landmarking mfs
        X_small = pd.DataFrame(np.random.rand(3, 7))
        Y_small = pd.Series([0, 1, 0], name="target").astype("str")
        metafeatures = Metafeatures()

        with self.assertRaises(ValueError) as cm:
            metafeatures.compute(X_small, Y_small, n_folds=2)
        self.assertEqual(
            str(cm.exception),
            "The minimum number of instances in each class of Y is n_folds=2."
            + " Class 1 has 1.")
Example #16
 def test_output_json_compatibility(self):
     for dataset_filename, dataset in self.datasets.items():
         computed_mfs = Metafeatures().compute(
             X=dataset["X"],
             Y=dataset["Y"],
             column_types=dataset["column_types"])
         try:
             json.dumps(computed_mfs)  # return value unused; only checks serializability
         except Exception as e:
             self.fail(
                 f"Failed to convert metafeature output to json: {str(e)}")
Example #17
 def test_output_format(self):
     with open(METAFEATURES_JSON_SCHEMA_PATH) as f:
         mf_schema = json.load(f)
     for dataset_filename, dataset in self.datasets.items():
         computed_mfs = Metafeatures().compute(
             X=dataset["X"],
             Y=dataset["Y"],
             column_types=dataset["column_types"])
         try:
             jsonschema.validate(computed_mfs, mf_schema)
         except jsonschema.exceptions.ValidationError:
             self.fail(
                 f"Metafeatures computed from {dataset_filename} do not " +
                 "conform to schema")
Example #18
 def test_compute_effects_on_dataset(self):
     """
     Tests whether computing metafeatures has any side effects on the input
     X or Y data. Fails if there are any side effects.
     """
     for dataset in self.datasets.values():
         X_copy, Y_copy = dataset["X"].copy(), dataset["Y"].copy()
         Metafeatures().compute(X=dataset["X"],
                                Y=dataset["Y"],
                                column_types=dataset["column_types"])
         self.assertTrue(
             X_copy.equals(dataset["X"]) and Y_copy.equals(dataset["Y"]),
             "Input data has changed after Metafeatures.compute")
Example #19
    def test_compute_time(self):
        no_time_mfs = Metafeatures().compute(self.dummy_features,
                                             self.dummy_target,
                                             return_times=False)
        self.assertTrue(
            all(
                len(result) == 1 and consts.COMPUTE_TIME_KEY not in result
                and consts.VALUE_KEY in result
                for result in no_time_mfs.values()),
            'return_times is set to False but some compute_times were still returned'
        )

        timed_mfs = Metafeatures().compute(
            self.dummy_features,
            self.dummy_target,
            return_times=True,
        )
        self.assertTrue(
            all(
                len(result) == 2 and consts.COMPUTE_TIME_KEY in result
                and consts.VALUE_KEY in result
                for result in timed_mfs.values()),
            'return_times is set to True but some compute_times were not returned'
        )
Example #20
def run_metafeature_benchmark(benchmark_name, iters=100):
    """
    Computes metafeatures `iters` times over the test datasets and stores
    comparable information in ./<benchmark_name>.json.
    """
    with open(METADATA_PATH, "r") as f:
        dataset_descriptions = json.load(f)
    benchmark_data = {}
    for dataset_metadata in dataset_descriptions:
        print(dataset_metadata["filename"])
        X, Y, column_types = read_dataset(dataset_metadata)
        init_times = []
        total_compute_times = []
        metafeature_compute_times = {mf_id: [] for mf_id in Metafeatures.IDS}
        for i in range(iters):
            print(f"iter {i}")
            start_timestamp = time.time()
            mf = Metafeatures()
            init_timestamp = time.time()
            computed_mfs = mf.compute(X=X,
                                      Y=Y,
                                      column_types=column_types,
                                      seed=CORRECTNESS_SEED)
            compute_timestamp = time.time()
            init_times.append(init_timestamp - start_timestamp)
            total_compute_times.append(compute_timestamp - init_timestamp)
            for mf_id, result in computed_mfs.items():
                metafeature_compute_times[mf_id].append(
                    result[consts.COMPUTE_TIME_KEY])
        benchmark_data[dataset_metadata["filename"]] = {
            "init_time": {
                "mean": np.mean(init_times),
                "std_dev": np.std(init_times)
            },
            "total_compute_time": {
                "mean": np.mean(total_compute_times),
                "std_dev": np.std(total_compute_times)
            },
            "metafeature_compute_time": {
                mf_id: {
                    "mean": np.mean(mf_times),
                    "std_dev": np.std(mf_times)
                }
                for mf_id, mf_times in metafeature_compute_times.items()
            }
        }
    write_benchmark_data(benchmark_name, benchmark_data)
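
For reference, a hypothetical invocation; the benchmark name and iteration count are arbitrary, and write_benchmark_data is assumed to be defined alongside this function:

run_metafeature_benchmark("baseline_benchmark", iters=10)  # writes ./baseline_benchmark.json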
Example #21
def _compare_metafeatures(oml_dataset, tol, verbose):
    # get metafeatures from dataset using our metafeatures
    our_mfs = Metafeatures().compute(X=oml_dataset["X"],
                                     Y=oml_dataset["Y"],
                                     verbose=verbose)
    oml_mfs = oml_dataset["metafeatures"]
    with open("./tests/oml_metafeature_map.json", "r") as f:
        mf_id_map = json.load(f)

    oml_exclusive_mfs = dict(oml_mfs)
    our_exclusive_mfs = {}
    consistent_mfs = {}
    inconsistent_mfs = {}

    for our_mf_id, our_mf_result in our_mfs.items():
        our_mf_value = our_mf_result[consts.VALUE_KEY]
        if our_mf_id in mf_id_map:
            oml_mf_id = mf_id_map[our_mf_id]["openmlName"]
            if oml_mf_id in oml_mfs:
                oml_exclusive_mfs.pop(oml_mf_id)
                oml_mf_value = oml_mfs[oml_mf_id]
                # hoisted above the branch: `comparison` below needs the
                # multiplier even when the value is a string
                mf_multiplier = mf_id_map[our_mf_id]["multiplier"]
                if type(our_mf_value) is str:
                    # string-valued metafeatures cannot be compared numerically
                    diff = None
                else:
                    diff = abs(our_mf_value - mf_multiplier * oml_mf_value)
                comparison = {
                    our_mf_id: {
                        "openml": mf_multiplier * oml_mf_value,
                        "metalearn": our_mf_value
                    }
                }
                if diff is None or diff > tol:
                    inconsistent_mfs.update(comparison)
                else:
                    consistent_mfs.update(comparison)
            else:
                our_exclusive_mfs[our_mf_id] = our_mf_value
        else:
            our_exclusive_mfs[our_mf_id] = our_mf_value

    return {
        "INCONSISTENT SHARED METAFEATURES": inconsistent_mfs,
        "CONSISTENT SHARED METAFEATURES": consistent_mfs,
        "OUR EXCLUSIVE METAFEATURES": our_exclusive_mfs,
        "OPENML EXCLUSIVE METAFEATURES": oml_exclusive_mfs
    }
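
A hypothetical call, assuming oml_dataset is a dict providing the "X" (DataFrame), "Y" (Series), and "metafeatures" (dict) keys the function reads; load_oml_dataset is a placeholder, not part of the source:

oml_dataset = load_oml_dataset()  # placeholder loader
report = _compare_metafeatures(oml_dataset, tol=0.05, verbose=False)
print(json.dumps(report, indent=2, default=str))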
Example #22
 def test_n_folds_with_small_dataset_no_landmarkers(self):
     # should not raise an error on a small (few instances) dataset
     # because no landmarking metafeatures are requested
     X_small = pd.DataFrame(np.random.rand(3, 7))
     Y_small = pd.Series([0, 1, 0], name="target").astype("str")
     metafeature_ids = [
         "NumberOfInstances", "NumberOfFeatures", "NumberOfClasses",
         "NumberOfNumericFeatures", "NumberOfCategoricalFeatures"
     ]
     try:
         Metafeatures().compute(X_small,
                                Y_small,
                                metafeature_ids=metafeature_ids,
                                n_folds=2)
     except Exception as e:
         exc_type = type(e).__name__
         self.fail(f"computing metafeatures raised {exc_type} unexpectedly")
Example #23
    def test_correctness(self):
        """Tests that metafeatures are computed correctly, for known datasets.
        """
        test_failures = {}
        test_name = inspect.stack()[0][3]
        for dataset_filename, dataset in self.datasets.items():
            computed_mfs = Metafeatures().compute(
                X=dataset["X"],
                Y=dataset["Y"],
                seed=CORRECTNESS_SEED,
                column_types=dataset["column_types"])
            known_mfs = dataset["known_metafeatures"]
            required_checks = [(self._check_correctness,
                                [computed_mfs, known_mfs, dataset_filename]),
                               (self._check_compare_metafeature_lists,
                                [computed_mfs, known_mfs, dataset_filename])]
            test_failures.update(self._perform_checks(required_checks))

        self._report_test_failures(test_failures, test_name)
Example #24
    def test_individual_metafeature_correctness(self):
        test_failures = {}
        test_name = inspect.stack()[0][3]
        for dataset_filename, dataset in self.datasets.items():
            known_mfs = dataset["known_metafeatures"]
            for mf_id in Metafeatures.IDS:
                computed_mfs = Metafeatures().compute(
                    X=dataset["X"],
                    Y=dataset["Y"],
                    seed=CORRECTNESS_SEED,
                    metafeature_ids=[mf_id],
                    column_types=dataset["column_types"])
                required_checks = [
                    (self._check_correctness,
                     [computed_mfs, known_mfs, dataset_filename])
                ]
                test_failures.update(self._perform_checks(required_checks))

        self._report_test_failures(test_failures, test_name)
Example #25
    def test_soft_timeout(self):
        """Tests Metafeatures().compute() with timeout set"""
        test_name = inspect.stack()[0][3]
        test_failures = {}
        for dataset_filename, dataset in self.datasets.items():
            metafeatures = Metafeatures()

            start_time = time.time()
            metafeatures.compute(X=dataset["X"],
                                 Y=dataset["Y"],
                                 seed=CORRECTNESS_SEED,
                                 column_types=dataset["column_types"])
            full_compute_time = time.time() - start_time

            start_time = time.time()
            computed_mfs = metafeatures.compute(
                X=dataset["X"],
                Y=dataset["Y"],
                seed=CORRECTNESS_SEED,
                column_types=dataset["column_types"],
                timeout=full_compute_time / 2)
            limited_compute_time = time.time() - start_time

            self.assertGreater(
                full_compute_time, limited_compute_time,
                f"Compute metafeatures exceeded timeout on '{dataset_filename}'"
            )
            computed_mfs_timeout = {
                k: v
                for k, v in computed_mfs.items()
                if v[consts.VALUE_KEY] != consts.TIMEOUT
            }
            known_mfs = dataset["known_metafeatures"]
            required_checks = [
                (self._check_correctness,
                 [computed_mfs_timeout, known_mfs, dataset_filename]),
                (self._check_compare_metafeature_lists,
                 [computed_mfs, known_mfs, dataset_filename])
            ]

            test_failures.update(self._perform_checks(required_checks))
        self._report_test_failures(test_failures, test_name)
Example #26
    def test_dataframe_input_error(self):
        """ Tests if `compute` gives a user-friendly error when a TypeError or ValueError occurs. """

        expected_error_message1 = "X must be of type pandas.DataFrame"
        fail_message1 = "We expect a user friendly message when the features passed to compute is not a Pandas.DataFrame."
        expected_error_message2 = "X must not be empty"
        fail_message2 = "We expect a user friendly message when the features passed to compute are empty."
        expected_error_message3 = "Y must be of type pandas.Series"
        fail_message3 = "We expect a user friendly message when the target column passed to compute is not a Pandas.Series."
        expected_error_message4 = "Y must have the same number of rows as X"
        fail_message4 = "We expect a user friendly message when the target column passed to compute has a number of rows different than X's."
        # assertRaises already verifies the exception type; any other error would fail the test.

        with self.assertRaises(TypeError) as cm:
            Metafeatures().compute(X=None, Y=self.dummy_target)
        self.assertEqual(str(cm.exception), expected_error_message1,
                         fail_message1)

        with self.assertRaises(TypeError) as cm:
            Metafeatures().compute(X=np.zeros((500, 50)),
                                   Y=pd.Series(np.zeros(500)))
        self.assertEqual(str(cm.exception), expected_error_message1,
                         fail_message1)

        with self.assertRaises(ValueError) as cm:
            Metafeatures().compute(X=pd.DataFrame(np.zeros((0, 50))),
                                   Y=pd.Series(np.zeros(500)))
        self.assertEqual(str(cm.exception), expected_error_message2,
                         fail_message2)

        with self.assertRaises(ValueError) as cm:
            Metafeatures().compute(X=pd.DataFrame(np.zeros((500, 0))),
                                   Y=pd.Series(np.zeros(500)))
        self.assertEqual(str(cm.exception), expected_error_message2,
                         fail_message2)

        with self.assertRaises(TypeError) as cm:
            Metafeatures().compute(X=pd.DataFrame(np.zeros((500, 50))),
                                   Y=np.random.randint(2,
                                                       size=500).astype("str"))
        self.assertEqual(str(cm.exception), expected_error_message3,
                         fail_message3)

        with self.assertRaises(ValueError) as cm:
            Metafeatures().compute(X=pd.DataFrame(np.zeros((500, 50))),
                                   Y=pd.Series(np.random.randint(2, size=0),
                                               name="target").astype("str"))
        self.assertEqual(str(cm.exception), expected_error_message4,
                         fail_message4)
Example #27
 def test_sampling_shape_invalid_input(self):
     error_tests = [
         {
             "sample_shape": "bad_shape",
             "message": "`sample_shape` must be of type `tuple` or `list`"
         },
         {
             "sample_shape": {
                 0: "bad",
                 1: "shape"
             },
             "message": "`sample_shape` must be of type `tuple` or `list`"
         },
         {
             "sample_shape": (2, 2, 2),
             "message": "`sample_shape` must be of length 2"
         },
         {
             "sample_shape": [1],
             "message": "`sample_shape` must be of length 2"
         },
         {
             "sample_shape": (0, 1),
             "message": "Cannot sample less than one row"
         },
         {
             "sample_shape": (1, 0),
             "message": "Cannot sample less than 1 column"
         },
         {
             "sample_shape": (3, 10),
             # 4 based on self.dummy_target
             "message": "Cannot sample less than 4 rows from Y"
         }
     ]
     for test in error_tests:
         with self.assertRaises(ValueError) as cm:
             Metafeatures().compute(self.dummy_features,
                                    self.dummy_target,
                                    sample_shape=test["sample_shape"])
         self.assertEqual(str(cm.exception), test["message"])
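
A hypothetical sketch of the shape checks these messages imply; the last constraint ("Cannot sample less than 4 rows from Y") also depends on n_folds and the number of classes in Y, which this sketch omits:

def _validate_sample_shape(sample_shape):
    # reconstructed from the expected messages above; not the library's code
    if not isinstance(sample_shape, (tuple, list)):
        raise ValueError("`sample_shape` must be of type `tuple` or `list`")
    if len(sample_shape) != 2:
        raise ValueError("`sample_shape` must be of length 2")
    if sample_shape[0] < 1:
        raise ValueError("Cannot sample less than one row")
    if sample_shape[1] < 1:
        raise ValueError("Cannot sample less than 1 column")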
Example #28
    def test_request_metafeatures(self):
        SUBSET_LENGTH = 20
        test_failures = {}
        test_name = inspect.stack()[0][3]
        for dataset_filename, dataset in self.datasets.items():
            metafeature_ids = random.sample(Metafeatures.IDS, SUBSET_LENGTH)
            computed_mfs = Metafeatures().compute(
                X=dataset["X"],
                Y=dataset["Y"],
                seed=CORRECTNESS_SEED,
                metafeature_ids=metafeature_ids,
                column_types=dataset["column_types"])
            known_metafeatures = dataset["known_metafeatures"]
            required_checks = [
                (self._check_correctness,
                 [computed_mfs, known_metafeatures, dataset_filename])
            ]

            test_failures.update(self._perform_checks(required_checks))
            self.assertEqual(set(metafeature_ids), set(computed_mfs.keys()),
                             "Compute did not return requested metafeatures")
        self._report_test_failures(test_failures, test_name)
Example #29
    def compute_metafeatures(self, X, y):
        try:
            self.metafeatures = Metafeatures().compute(
                X=X, Y=y, metafeature_ids=list(self.default_mf.keys()))
        except Exception:
            logger.info('ERROR COMPUTING METAFEATURES - USING DEFAULT')
            traceback.print_exc(file=sys.stdout)
            return self.default_mf

        self.single_value_mf = {}
        for feature in self.default_mf.keys():
            if self.metafeatures.get(feature) is None:
                self.single_value_mf[feature] = 0
            else:
                v = self.metafeatures[feature]['value']
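                # v != v is True only for NaN; consts.NUMERIC_TARGETS marks
                # target-dependent metafeatures skipped for numeric targets,
                # so both fall back to the default 0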
                if (v != v) or (v in ['NUMERIC_TARGETS']):
                    self.single_value_mf[feature] = 0
                else:
                    self.single_value_mf[feature] = v

        logger.info("METAFEATURES %s %s", self.single_value_mf,
                    len(self.single_value_mf))
        return self.single_value_mf

def get_list_metafeatures(list_X, list_y, type_metafeatures):
    metafeatures = Metafeatures()
    list_dataset_metafeatures = []

    for X, y in tqdm(zip(list_X, list_y), total=7084):
        mfs = metafeatures.compute(
            pd.DataFrame(X),
            Y=pd.Series(y, dtype="category"),
            metafeature_ids=metafeatures.list_metafeatures(
                group=type_metafeatures),
            exclude=None,
            seed=0,
            #verbose=True,
            timeout=60,
            # return_times=True,
        )
        list_dataset_metafeatures.append(
            pd.DataFrame(mfs).reset_index(drop=True))

    df_metafeatures = pd.concat(list_dataset_metafeatures).fillna(0)
    df_metafeatures["index"] = list_files
    df_metafeatures.set_index("index", inplace=True)
    return df_metafeatures
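
A hypothetical usage, assuming list_X and list_y are parallel lists of feature matrices and label vectors, that a module-level list_files names one entry per dataset, and that the group name is illustrative:

df = get_list_metafeatures(
    list_X, list_y, consts.MetafeatureGroup.STATISTICAL.value)
print(df.shape)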