Python format_x_y 예제들

프로그래밍 언어: Python

네임스페이스/패키지 이름: gama.data_formatting

메소드/함수: format_x_y

hotexamples.com에서의 예제들: 3

Python format_x_y - 3개의 예제가 발견되었습니다. 이것들은 오픈소스 프로젝트에서 추출된 Python의 gama.data_formatting.format_x_y에 대한 실세계 최고 등급의 예제들입니다. 예제들을 평가하여 예제의 품질 향상에 도움을 줄 수 있습니다.

예제 #1

파일 보기

    def test_format_x_y(self):
        """ format_x_y should normalise every tested combination of x and y types.

        x is passed as a numpy array and as a DataFrame; y is passed as a 1d
        array, a Series, a DataFrame and a 2d column array.  For each pairing
        the result is checked once with the default target type (expected to
        be a pd.Series) and once with `y_type=pd.DataFrame`.
        """
        from sklearn.datasets import load_digits

        def check_output(features, target, expected_target_type):
            # x must always come back as a DataFrame, y as the requested type,
            # and both must describe the same number of samples.
            assert isinstance(features, pd.DataFrame)
            assert isinstance(target, expected_target_type)
            assert len(features) == len(target)

        X_np, y_np = load_digits(return_X_y=True)
        x_variants = [X_np, pd.DataFrame(X_np)]
        y_variants = [
            y_np,
            pd.Series(y_np),
            pd.DataFrame(y_np),
            y_np.reshape(-1, 1),
        ]

        for x_in in x_variants:
            for y_in in y_variants:
                x_out, y_out = format_x_y(x_in, y_in)
                check_output(x_out, y_out, pd.Series)

                x_out, y_out = format_x_y(x_in, y_in, y_type=pd.DataFrame)
                check_output(x_out, y_out, pd.DataFrame)

예제 #2

파일 보기

    def test_format_x_y_missing_targets(self):
        """ Samples with missing labels should be removed from training data. """
        from sklearn.datasets import load_digits

        features, labels = load_digits(return_X_y=True)
        # Cast to float so that NaN can be stored, then blank out every
        # even-indexed label.
        labels = labels.astype(float)
        labels[::2] = np.nan

        kept_features, kept_labels = format_x_y(features, labels)

        # The input keeps its size; only the odd-indexed samples are expected
        # to remain in the formatted output.
        assert labels.shape == (1797, )
        assert kept_labels.shape == (898, )
        assert np.array_equal(labels[1::2], kept_labels)
        assert np.array_equal(features[1::2, :], kept_features)

        # Output must also be well formatted: DataFrame x, Series y, equal length.
        assert isinstance(kept_features, pd.DataFrame)
        assert isinstance(kept_labels, pd.Series)
        assert len(kept_features) == len(kept_labels)

예제 #3

파일 보기

    def fit(
        self,
        x: Union[pd.DataFrame, np.ndarray],
        y: Union[pd.DataFrame, pd.Series, np.ndarray],
        warm_start: Optional[List[Individual]] = None,
    ) -> "Gama":
        """ Search for a machine learning pipeline for (x, y) and fit a final model.

        The work is split over three timed activities:

        1. "preprocessing": format and encode the data and, for large data,
           remove some components from the search space.
        2. "search": look for good pipelines, starting from `warm_start` if
           given.  Using Genetic Programming, the pipelines chosen should lead
           to gradually better models; pipelines are validated internally
           using cross validation.
        3. "postprocess": hand the final population, best first, to the
           post-processing step, whose result is stored in `self.model`.

        Parameters
        ----------
        x: pandas.DataFrame or numpy.ndarray, shape = [n_samples, n_features]
            Training data. All elements must be able to be converted to float.
        y: pandas.DataFrame, pandas.Series or numpy.ndarray, shape = [n_samples,]
            Target values.
            If a DataFrame is provided, assumes the first column contains target values.
        warm_start: List[Individual], optional (default=None)
            A list of individuals to start the search procedure with.
            If None is given, random start candidates are generated.

        Returns
        -------
        Gama
            This instance (`self`).
        """
        # Start from a fresh time keeper that reuses the configured total time.
        self._time_manager = TimeKeeper(self._time_manager.total_time)

        with self._time_manager.start_activity(
                "preprocessing", activity_meta=["default"]):
            x, self._y = format_x_y(x, y)
            self._inferred_dtypes = x.dtypes

            # The presence of a label encoder marks this as a classification task.
            is_classification = hasattr(self, "_label_encoder")
            self._x, self._basic_encoding_pipeline = basic_encoding(
                x, is_classification)
            self._fixed_pipeline_extension = basic_pipeline_extension(
                self._x, is_classification)
            self._operator_set._safe_compile = partial(
                self._operator_set._compile,
                preprocessing_steps=self._fixed_pipeline_extension,
            )

            cache_size = self._evaluation_library._m
            models_are_stored = cache_size is None or cache_size > 0

            if models_are_stored:
                if self._x.shape[0] * self._x.shape[1] > 6_000_000:
                    # When m > 0 a model is stored for each evaluation.  For
                    # data of this size KNN creates models of about 76Mb each,
                    # which is too big, so it is excluded from the search.
                    log.info(
                        "Excluding KNN from search because the dataset is too big."
                    )
                    from sklearn.neighbors import (
                        KNeighborsClassifier,
                        KNeighborsRegressor,
                    )

                    knn_estimators = (KNeighborsClassifier, KNeighborsRegressor)
                    self._pset["prediction"] = [
                        primitive for primitive in self._pset["prediction"]
                        if primitive.identifier not in knn_estimators
                    ]

                if self._x.shape[1] > 50:
                    log.info(
                        "Data has too many features to include PolynomialFeatures")
                    from sklearn.preprocessing import PolynomialFeatures

                    self._pset["data"] = [
                        primitive for primitive in self._pset["data"]
                        if primitive.identifier not in (PolynomialFeatures,)
                    ]

        # Leave the configured fraction of the remaining time for post-processing.
        search_budget = int((1 - self._post_processing.time_fraction) *
                            self._time_manager.total_time_remaining)

        with self._time_manager.start_activity(
                "search",
                time_limit=search_budget,
                activity_meta=[self._search_method.__class__.__name__],
        ):
            self._search_phase(warm_start, timeout=search_budget)

        with self._time_manager.start_activity(
                "postprocess",
                time_limit=int(self._time_manager.total_time_remaining),
                activity_meta=[self._post_processing.__class__.__name__],
        ):

            def fitness_values(ind):
                return cast(Fitness, ind.fitness).values

            # Sort ascending, then reverse in place, so that the highest
            # fitness values come first.
            ranked_individuals = sorted(self._final_pop, key=fitness_values)
            ranked_individuals.reverse()

            self._post_processing.dynamic_defaults(self)
            self.model = self._post_processing.post_process(
                self._x,
                self._y,
                self._time_manager.total_time_remaining,
                ranked_individuals,
            )

        if self._store == "all":
            return self

        # Map the `_store` setting onto the argument passed to `cleanup`.
        cleanup_target = {
            "nothing": "all",
            "logs": "evaluations",
            "models": "logs",
        }
        self.cleanup(cleanup_target[self._store])
        return self