Example #1
0
    def test_exclude_metafeature_groups(self):
        """Check that compute() never returns a metafeature belonging to an
        excluded group, and that the rest are still computed correctly."""
        SUBSET_LENGTH = 3
        failures = {}
        test_name = inspect.stack()[0][3]
        for filename, dataset in self.datasets.items():
            # Randomly exclude a small subset of the available groups.
            all_groups = [group.value for group in consts.MetafeatureGroup]
            excluded_groups = random.sample(all_groups, SUBSET_LENGTH)
            computed_mfs = Metafeatures().compute(
                X=dataset["X"],
                Y=dataset["Y"],
                column_types=dataset["column_types"],
                seed=CORRECTNESS_SEED,
                exclude_groups=excluded_groups,
            )
            checks = [
                (self._check_correctness,
                 [computed_mfs, dataset["known_metafeatures"], filename])
            ]
            failures.update(self._perform_checks(checks))

            # Every metafeature id reachable from an excluded group.
            excluded_ids = {
                mf_id
                for group in excluded_groups
                for mf_id in Metafeatures.list_metafeatures(group)
            }
            # Set intersection: any overlap means an excluded id was computed.
            if excluded_ids & set(computed_mfs):
                self.fail('Metafeatures computed an excluded metafeature')
        self._report_test_failures(failures, test_name)
Example #2
0
    def test_request_metafeatures(self):
        """Check that compute() returns exactly the requested metafeature ids,
        with correct values, when metafeature_ids is passed."""
        SUBSET_LENGTH = 20
        failures = {}
        test_name = inspect.stack()[0][3]
        for filename, dataset in self.datasets.items():
            requested_ids = random.sample(Metafeatures.IDS, SUBSET_LENGTH)
            computed_mfs = Metafeatures().compute(
                X=dataset["X"],
                Y=dataset["Y"],
                seed=CORRECTNESS_SEED,
                metafeature_ids=requested_ids,
                column_types=dataset["column_types"])
            checks = [
                (self._check_correctness,
                 [computed_mfs, dataset["known_metafeatures"], filename])
            ]
            failures.update(self._perform_checks(checks))

            # The returned id set must match the requested id set exactly.
            self.assertEqual(set(requested_ids), set(computed_mfs.keys()),
                             "Compute did not return requested metafeatures")
        self._report_test_failures(failures, test_name)
Example #3
0
    def test_exclude_metafeatures(self):
        """Check that compute() never returns a metafeature id that was
        explicitly passed via the `exclude` parameter, and that the
        remaining metafeatures are still computed correctly."""
        SUBSET_LENGTH = 20
        test_failures = {}
        test_name = inspect.stack()[0][3]
        for dataset_filename, dataset in self.datasets.items():
            metafeature_ids = random.sample(Metafeatures.IDS, SUBSET_LENGTH)
            computed_mfs = Metafeatures().compute(
                X=dataset["X"],
                Y=dataset["Y"],
                seed=CORRECTNESS_SEED,
                exclude=metafeature_ids,
                column_types=dataset["column_types"])
            known_metafeatures = dataset["known_metafeatures"]
            required_checks = [
                (self._check_correctness,
                 [computed_mfs, known_metafeatures, dataset_filename])
            ]

            test_failures.update(self._perform_checks(required_checks))
            # self.fail is the idiomatic unittest way to signal failure
            # (assertTrue(False, msg) obscures intent) and matches the
            # style used by the other exclusion test in this file.
            if any(mf_id in computed_mfs.keys() for mf_id in metafeature_ids):
                self.fail("Metafeatures computed an excluded metafeature")

        self._report_test_failures(test_failures, test_name)
Example #4
0
def compute_dataset_metafeatures():
    """Interactively recompute metafeatures for each dataset in the metadata
    file, optionally show a diff against the known (stored) values, and
    optionally overwrite the stored metafeature file.

    Per dataset the user chooses: (y)es recompute, (v)erbose recompute with a
    diff report, or (n)o skip; afterwards (y)es/(n)o to persist the results.
    """
    # `with` guarantees the metadata file handle is closed; the previous
    # `json.load(open(...))` form leaked it.
    with open(METADATA_PATH, "r") as fp:
        metadata = json.load(fp)
    for dataset_metadata in metadata:
        dataset_filename = dataset_metadata["filename"]

        # Re-prompt until a valid choice is entered.
        choice = None
        while choice not in ("y", "v", "n"):
            choice = input(dataset_filename + " [(y)es, (v)erbose, (n)o]: ")

        if choice == "n":
            continue

        X, Y, column_types = read_dataset(dataset_metadata)

        start_time = time.time()
        computed_mfs = Metafeatures().compute(X=X,
                                              Y=Y,
                                              column_types=column_types,
                                              seed=CORRECTNESS_SEED)
        run_time = time.time() - start_time

        if choice == "v":
            # Compare freshly computed values against the stored baseline.
            known_mf_path = get_dataset_metafeatures_path(dataset_filename)
            with open(known_mf_path, 'r') as fp:
                known_mfs = json.load(fp)

            new_mfs = {}
            deleted_mfs = {}
            updated_mfs = {}
            same_mfs = {}
            # Iterating a dict yields its keys, so a set union replaces the
            # old list-concatenation form without intermediate lists.
            all_mf_names = set(computed_mfs) | set(known_mfs)
            for mf in all_mf_names:
                if mf not in known_mfs:
                    new_mfs[mf] = computed_mfs[mf]
                elif mf not in computed_mfs:
                    deleted_mfs[mf] = known_mfs[mf]
                elif is_close(computed_mfs[mf]['value'],
                              known_mfs[mf]['value']):
                    same_mfs[mf] = computed_mfs[mf]
                else:
                    updated_mfs[mf] = {
                        'known': known_mfs[mf],
                        'computed': computed_mfs[mf]
                    }

            print('UNCHANGED METAFEATURES')
            print(json.dumps(same_mfs, sort_keys=True, indent=4))
            print('DELETED METAFEATURES')
            print(json.dumps(deleted_mfs, sort_keys=True, indent=4))
            print('NEW METAFEATURES')
            print(json.dumps(new_mfs, sort_keys=True, indent=4))
            print('UPDATED METAFEATURES')
            print(json.dumps(updated_mfs, sort_keys=True, indent=4))

        print("Runtime: " + str(run_time))

        choice = None
        while choice not in ("y", "n"):
            choice = input(
                f"Update {dataset_filename} metafeatures? [(y)es, (n)o]: ")
        if choice == "y":
            mf_file_path = get_dataset_metafeatures_path(dataset_filename)
            with open(mf_file_path, 'w') as fp:
                json.dump(computed_mfs, fp, sort_keys=True, indent=4)