Python MatPipe.fit примеры использования

Язык программирования: Python

Пространство имен/Пакет: automatminer.pipeline

Класс/Тип: MatPipe

Метод/Функция: fit

Примеров на hotexamples.com: 6

Python MatPipe.fit — найдено 6 примеров. Это лучшие примеры кода на Python для automatminer.pipeline.MatPipe.fit, полученные из проектов с открытым исходным кодом. Вы можете оценить каждый пример, чтобы помочь нам улучшить качество примеров.

Основные методы

Показать Скрыть

MatPipe(10)

fit(6)

save(6)

predict(5)

benchmark(3)

digest(3)

load(3)

inspect(2)

from_preset(1)

summarize(1)

Пример #1

Показать файл

    def run_task(self, fw_spec):
        """Fit a MatPipe on a single dataset and save the fitted pipe to disk.

        Args:
            fw_spec (dict): Task specification. Keys read here are
                "pipe_config" (a dict holding "learner_name",
                "learner_kwargs", "reducer_kwargs", "cleaner_kwargs" and
                "autofeaturizer_kwargs"), "target", "data_file" and
                "base_save_dir".

        Raises:
            KeyError: If a required spec key, or the AMM_DATASET_DIR /
                AMM_SINGLE_FIT_DIR environment variable, is missing.
            ValueError: If "learner_name" is neither "TPOTAdaptor" nor "rf".
        """
        # Read data from fw_spec
        pipe_config_dict = fw_spec["pipe_config"]
        target = fw_spec["target"]
        data_file = fw_spec["data_file"]
        learner_name = pipe_config_dict["learner_name"]
        learner_kwargs = pipe_config_dict["learner_kwargs"]
        reducer_kwargs = pipe_config_dict["reducer_kwargs"]
        cleaner_kwargs = pipe_config_dict["cleaner_kwargs"]
        autofeaturizer_kwargs = pipe_config_dict["autofeaturizer_kwargs"]

        # Resolve data_file relative to the dataset dir of this computing
        # resource.
        data_dir = os.environ["AMM_DATASET_DIR"]
        data_file = os.path.join(data_dir, data_file)

        # Resolve save_dir relative to the output dir of this computing
        # resource.
        bench_dir = os.environ["AMM_SINGLE_FIT_DIR"]
        base_save_dir = os.path.join(bench_dir, fw_spec["base_save_dir"])

        # exist_ok avoids the check-then-create race when several tasks
        # target the same directory at once.
        os.makedirs(base_save_dir, exist_ok=True)

        # Set up pipeline config
        if learner_name == "TPOTAdaptor":
            learner = TPOTAdaptor(**learner_kwargs)
        elif learner_name == "rf":
            # The same kwargs are forwarded to both the regressor and the
            # classifier, so warn the caller about it.
            warnings.warn(
                "Learner kwargs passed into RF regressor/classifiers bc. rf being used."
            )
            learner = SinglePipelineAdaptor(
                regressor=RandomForestRegressor(**learner_kwargs),
                classifier=RandomForestClassifier(**learner_kwargs),
            )
        else:
            raise ValueError(f"{learner_name} not supported yet!")
        pipe_config = {
            "learner": learner,
            "reducer": FeatureReducer(**reducer_kwargs),
            "cleaner": DataCleaner(**cleaner_kwargs),
            "autofeaturizer": AutoFeaturizer(**autofeaturizer_kwargs),
        }
        pipe = MatPipe(**pipe_config)

        # Set up dataset
        # Dataset should already be set up correctly as json beforehand.
        # this includes targets being converted to classification, removing
        # extra columns, having the names of featurization cols set to the
        # same as the matpipe config, etc.
        df = load_dataframe_from_json(data_file)

        pipe.fit(df, target)
        pipe.save(os.path.join(base_save_dir, "pipe.p"))

Пример #2

Показать файл

    def test_user_features(self):
        """Check that a user-supplied column is picked up as a learner feature.

        Adds the "G_VRH" column to the fixture data, fits on rows [0, 200),
        predicts on rows [201, 250) and requires an r2 score above 0.75.
        """
        pipe = MatPipe(**debug_config)
        # Work on a copy so the shared fixture frame is not mutated in place.
        df = self.df.copy()
        df["G_VRH"] = self.extra_features
        self.assertIn("G_VRH", df.columns)
        self.assertIn("K_VRH", df.columns)
        df_train = df.iloc[:200]
        df_test = df.iloc[201:250]
        pipe.fit(df_train, self.target)

        # If shear modulus is included as a feature it should probably show up
        # in the final pipeline
        self.assertIn("G_VRH", pipe.learner.features)
        df_test = pipe.predict(df_test, self.target)
        true = df_test[self.target]
        test = df_test[self.target + " predicted"]
        self.assertGreater(r2_score(true, test), 0.75)

Пример #3

Показать файл

    def test_persistence_and_digest(self):
        """Check save/load round-tripping and digest output of a fitted pipe.

        Saving an unfitted pipe must raise NotFittedError. After fitting on
        the last 200 rows, the pipe is saved, reloaded and used to predict;
        the digest must be written to a file and returned as a string.
        """
        pipe = MatPipe(**debug_config)
        with self.assertRaises(NotFittedError):
            pipe.save()
        df = self.df[-200:]
        pipe.fit(df, self.target)

        filename = os.path.join(test_dir, "test_pipe.p")
        pipe.save(filename=filename)
        pipe = MatPipe.load(filename, logger=False)
        df_test = pipe.predict(self.df[-220:-201], self.target)
        self.assertIn(self.target, df_test.columns)
        self.assertIn(self.target + " predicted", df_test.columns)

        digest_file = os.path.join(test_dir, "matdigest.txt")
        digest = pipe.digest(filename=digest_file)
        self.assertTrue(os.path.isfile(digest_file))
        self.assertIsInstance(digest, str)

Пример #4

Показать файл

    def test_transferability(self):
        """Check that one pipe object can be fit, used, refit and reused.

        The same pipe is fit and evaluated on two disjoint row windows; each
        time the prediction frame must not contain "composition" and the r2
        score must exceed 0.5.
        """
        pipe = MatPipe(**debug_config)

        # (train rows, test rows); the second window refits the same pipe.
        windows = (
            (slice(None, 200), slice(201, 250)),
            (slice(250, 450), slice(451, 500)),
        )
        for train_rows, test_rows in windows:
            df_train = self.df.iloc[train_rows]
            df_test = self.df.iloc[test_rows]
            pipe.fit(df_train, self.target)
            df_test = pipe.predict(df_test, self.target)
            true = df_test[self.target]
            test = df_test[self.target + " predicted"]
            self.assertNotIn("composition", df_test.columns)
            self.assertGreater(r2_score(true, test), 0.5)

Пример #5

Показать файл

Файл: test_pipeline.py Проект: zhigangmei/automatminer

    class TestMatPipe(unittest.TestCase):
        """Tests for MatPipe fit/predict, benchmarking, persistence and digests.

        Individual tests are skipped when their tag appears in the
        module-level ``skip`` collection.
        """

        def setUp(self):
            """Load the dataset and build one plain and one cached pipe."""
            df = load_dataset("elastic_tensor_2015").rename(
                columns={"formula": "composition"})
            self.df = df[["composition", "K_VRH"]]
            self.df_struc = df[["composition", "structure", "K_VRH"]]
            self.extra_features = df["G_VRH"]
            self.target = "K_VRH"
            self.config = get_preset_config(config_preset, n_jobs=n_jobs)
            self.config_cached = get_preset_config(config_preset,
                                                   cache_src=CACHE_SRC,
                                                   n_jobs=n_jobs)
            self.pipe = MatPipe(**self.config)
            self.pipe_cached = MatPipe(**self.config_cached)

        @unittest.skipIf("transferability" in skip, reason)
        def test_transferability(self):
            """Fit and evaluate the same pipe on two disjoint row windows."""
            # (train rows, test rows); the second window refits the same pipe.
            windows = (
                (slice(None, 200), slice(201, 250)),
                (slice(250, 450), slice(451, 500)),
            )
            for train_rows, test_rows in windows:
                df_train = self.df.iloc[train_rows]
                df_test = self.df.iloc[test_rows]
                self.pipe.fit(df_train, self.target)
                df_test = self.pipe.predict(df_test)
                true = df_test[self.target]
                test = df_test[self.target + " predicted"]
                self.assertNotIn("composition", df_test.columns)
                self.assertGreater(r2_score(true, test), 0.5)

        @unittest.skipIf("user_features" in skip, reason)
        def test_user_features(self):
            """Check that a user-supplied column is picked up as a feature."""
            # self.df is a column subset of the loaded frame; copy before
            # adding a column so we neither mutate the fixture nor assign
            # onto a derived frame.
            df = self.df.copy()
            df["G_VRH"] = self.extra_features
            self.assertIn("G_VRH", df.columns)
            self.assertIn("K_VRH", df.columns)
            df_train = df.iloc[:200]
            df_test = df.iloc[201:250]
            self.pipe.fit(df_train, self.target)

            # If shear modulus is included as a feature it should probably show up
            # in the final pipeline
            self.assertIn("G_VRH", self.pipe.learner.features)
            df_test = self.pipe.predict(df_test)
            true = df_test[self.target]
            test = df_test[self.target + " predicted"]
            self.assertGreater(r2_score(true, test), 0.75)

        @unittest.skipIf("predict_kwargs" in skip, reason)
        def test_predict_kwargs(self):
            """Test mat_pipe.predict()'s ignore and output_col kwargs."""
            df_train = self.df.iloc[:200]
            # Copy the slice: a new column is assigned to it just below.
            df_test = self.df.iloc[201:250].copy()
            ef = "ExtraFeature"
            df_test[ef] = [i + 100 for i in range(df_test.shape[0])]
            self.pipe.fit(df_train, self.target)

            self.assertIn(ef, df_test.columns)
            self.assertIn("composition", df_test.columns)

            # Every ignored column must survive into the prediction frame.
            ignore = [ef, "composition"]
            predicted_ignored = self.pipe.predict(df_test, ignore=ignore)
            self.assertIn(ef, predicted_ignored.columns)
            self.assertIn("composition", predicted_ignored.columns)

            # With nothing ignored, neither column should be returned.
            predicted_none = self.pipe.predict(df_test, ignore=None)
            self.assertNotIn(ef, predicted_none.columns)
            self.assertNotIn("composition", predicted_none.columns)

            # Only the explicitly ignored column is kept.
            some = ["composition"]
            predicted_some = self.pipe.predict(df_test, ignore=some)
            self.assertNotIn(ef, predicted_some.columns)
            self.assertIn("composition", predicted_some.columns)

            output_col_name = self.target + "_pred"
            predicted_custom_col = self.pipe.predict(
                df_test, output_col=output_col_name)
            self.assertIn(output_col_name, predicted_custom_col)

        @unittest.skipIf("benchmarking" in skip, reason)
        def test_benchmarking_no_cache(self):
            """Benchmark the pipe built without cache_src."""
            pipe = self.pipe
            # Make sure we can't run a cached run with no cache AF and cache pipe
            with self.assertRaises(AutomatminerError):
                self._run_benchmark(cache=True, pipe=pipe)

            self._run_benchmark(cache=False, pipe=pipe)

        @unittest.skipIf("benchmarking" in skip, reason)
        def test_benchmarking_cache(self):
            """Benchmark the pipe built with cache_src."""
            pipe = self.pipe_cached

            # Make sure we can't run an uncached benchmark with the pipe
            # that was configured with a cache source.
            with self.assertRaises(AutomatminerError):
                self._run_benchmark(cache=False, pipe=pipe)
            self._run_benchmark(cache=True, pipe=pipe)

        @unittest.skipIf("persistence" in skip, reason)
        def test_persistence(self):
            """Check save/load round-tripping and the version check on load."""
            with self.assertRaises(NotFittedError):
                self.pipe.save()
            df = self.df[-200:]
            self.pipe.fit(df, self.target)

            # Load test
            self.pipe.save(filename=PIPE_PATH)
            self.pipe = MatPipe.load(PIPE_PATH)
            df_test = self.pipe.predict(self.df[-220:-201])
            self.assertIn(self.target, df_test.columns)
            self.assertIn(self.target + " predicted", df_test.columns)

            # Version test
            self.pipe.version = "not a real version"
            self.pipe.save(VERSION_PIPE_PATH)
            with self.assertRaises(AutomatminerError):
                MatPipe.load(VERSION_PIPE_PATH)

        @unittest.skipIf("digests" in skip, reason)
        def test_summarize_and_inspect(self):
            """Check inspect() and summarize() write a file and return a dict."""
            df = self.df[-200:]
            self.pipe.fit(df, self.target)

            # All inspect() calls run before any summarize() call; both
            # write to the same DIGEST_PATH + ext targets.
            for make_digest in (self.pipe.inspect, self.pipe.summarize):
                for ext in AMM_SUPPORTED_EXTS:
                    digest = make_digest(filename=DIGEST_PATH + ext)
                    self.assertTrue(os.path.isfile(DIGEST_PATH + ext))
                    self.assertIsInstance(digest, dict)

        def _run_benchmark(self, cache, pipe):
            """Run a 2-fold benchmark, then a single-fold subset benchmark.

            Args:
                cache: Forwarded as the ``cache`` kwarg of ``pipe.benchmark``.
                pipe: The MatPipe to benchmark.
            """
            # Test static, regular benchmark (no fittable featurizers)
            df = self.df.iloc[500:600]
            kfold = KFold(n_splits=2)
            df_tests = pipe.benchmark(df, self.target, kfold, cache=cache)
            self.assertEqual(len(df_tests), kfold.n_splits)

            # Make sure we retain a good amount of test samples...
            df_tests_all = pd.concat(df_tests)
            self.assertGreaterEqual(len(df_tests_all), 0.95 * len(df))

            # Test static subset of kfold
            df2 = self.df.iloc[500:550]
            df_tests2 = pipe.benchmark(df2,
                                       self.target,
                                       kfold,
                                       fold_subset=[0],
                                       cache=cache)
            self.assertEqual(len(df_tests2), 1)

        def tearDown(self) -> None:
            """Delete the cache, pipe and digest files left by the tests."""
            digests = [DIGEST_PATH + ext for ext in AMM_SUPPORTED_EXTS]
            for remnant in [CACHE_SRC, PIPE_PATH, VERSION_PIPE_PATH, *digests]:
                if os.path.exists(remnant):
                    os.remove(remnant)

Пример #6

Показать файл

class TestMatPipe(unittest.TestCase):
    """Tests for MatPipe fit/predict, benchmarking, persistence and digest.

    The benchmarking tests are skipped when the SKIP_INTENSIVE environment
    variable is set to a non-zero integer.
    """

    def setUp(self):
        """Load the dataset and build one plain and one cached pipe."""
        df = load_dataset("elastic_tensor_2015").rename(
            columns={"formula": "composition"})
        self.df = df[["composition", "K_VRH"]]
        self.df_struc = df[["composition", "structure", "K_VRH"]]
        self.extra_features = df["G_VRH"]
        self.target = "K_VRH"
        self.config = get_preset_config("debug_single")
        self.config_cached = get_preset_config("debug_single",
                                               cache_src=CACHE_SRC)
        self.pipe = MatPipe(**self.config)
        self.pipe_cached = MatPipe(**self.config_cached)

    def test_transferability(self):
        """Fit and evaluate the same pipe on two disjoint row windows."""
        # (train rows, test rows); the second window refits the same pipe.
        windows = (
            (slice(None, 200), slice(201, 250)),
            (slice(250, 450), slice(451, 500)),
        )
        for train_rows, test_rows in windows:
            df_train = self.df.iloc[train_rows]
            df_test = self.df.iloc[test_rows]
            self.pipe.fit(df_train, self.target)
            df_test = self.pipe.predict(df_test, self.target)
            true = df_test[self.target]
            test = df_test[self.target + " predicted"]
            self.assertNotIn("composition", df_test.columns)
            self.assertGreater(r2_score(true, test), 0.5)

    def test_user_features(self):
        """Check that a user-supplied column is picked up as a feature."""
        # self.df is a column subset of the loaded frame; copy before adding
        # a column so we neither mutate the fixture nor assign onto a
        # derived frame.
        df = self.df.copy()
        df["G_VRH"] = self.extra_features
        self.assertIn("G_VRH", df.columns)
        self.assertIn("K_VRH", df.columns)
        df_train = df.iloc[:200]
        df_test = df.iloc[201:250]
        self.pipe.fit(df_train, self.target)

        # If shear modulus is included as a feature it should probably show up
        # in the final pipeline
        self.assertIn("G_VRH", self.pipe.learner.features)
        df_test = self.pipe.predict(df_test, self.target)
        true = df_test[self.target]
        test = df_test[self.target + " predicted"]
        self.assertGreater(r2_score(true, test), 0.75)

    @unittest.skipIf(int(os.environ.get("SKIP_INTENSIVE", 0)),
                     "Test too intensive for CircleCI commit builds.")
    def test_benchmarking_no_cache(self):
        """Benchmark the pipe built without cache_src."""
        pipe = self.pipe
        # Make sure we can't run a cached run with no cache AF and cache pipe
        with self.assertRaises(AutomatminerError):
            self._run_benchmark(cache=True, pipe=pipe)

        self._run_benchmark(cache=False, pipe=pipe)

    @unittest.skipIf(int(os.environ.get("SKIP_INTENSIVE", 0)),
                     "Test too intensive for CircleCI commit builds.")
    def test_benchmarking_cache(self):
        """Benchmark the pipe built with cache_src."""
        pipe = self.pipe_cached

        # Make sure we can't run an uncached benchmark with the pipe that
        # was configured with a cache source.
        with self.assertRaises(AutomatminerError):
            self._run_benchmark(cache=False, pipe=pipe)
        self._run_benchmark(cache=True, pipe=pipe)

    def test_persistence_and_digest(self):
        """Check save/load round-tripping and digest output of a fitted pipe."""
        with self.assertRaises(NotFittedError):
            self.pipe.save()
        df = self.df[-200:]
        self.pipe.fit(df, self.target)

        filename = os.path.join(test_dir, PIPE_PATH)
        self.pipe.save(filename=filename)
        self.pipe = MatPipe.load(filename, logger=False)
        df_test = self.pipe.predict(self.df[-220:-201], self.target)
        self.assertIn(self.target, df_test.columns)
        self.assertIn(self.target + " predicted", df_test.columns)

        digest_file = os.path.join(test_dir, DIGEST_PATH)
        digest = self.pipe.digest(filename=digest_file)
        self.assertTrue(os.path.isfile(digest_file))
        self.assertIsInstance(digest, str)

    def _run_benchmark(self, cache, pipe):
        """Run a 5-fold benchmark, then a two-fold subset benchmark.

        Args:
            cache: Forwarded as the ``cache`` kwarg of ``pipe.benchmark``.
            pipe: The MatPipe to benchmark.
        """
        # Test static, regular benchmark (no fittable featurizers)
        df = self.df.iloc[500:600]
        kfold = KFold(n_splits=5)
        df_tests = pipe.benchmark(df, self.target, kfold, cache=cache)
        self.assertEqual(len(df_tests), kfold.n_splits)

        # Make sure we retain a good amount of test samples...
        df_tests_all = pd.concat(df_tests)
        self.assertGreaterEqual(len(df_tests_all), 0.95 * len(df))

        # Test static subset of kfold
        df2 = self.df.iloc[500:550]
        df_tests2 = pipe.benchmark(df2,
                                   self.target,
                                   kfold,
                                   fold_subset=[0, 3],
                                   cache=cache)
        self.assertEqual(len(df_tests2), 2)

    def tearDown(self):
        """Delete the cache, pipe and digest files left by the tests."""
        # test_persistence_and_digest writes to paths joined onto test_dir,
        # so clean those up as well as the bare paths.
        remnants = [CACHE_SRC, PIPE_PATH, DIGEST_PATH]
        remnants += [os.path.join(test_dir, name)
                     for name in (PIPE_PATH, DIGEST_PATH)]
        for remnant in remnants:
            if os.path.exists(remnant):
                os.remove(remnant)