def test_format_x_y(self): def well_formatted_x_y(x, y, y_type): assert isinstance(x, pd.DataFrame) assert isinstance(y, y_type) assert len(x) == len(y) from sklearn.datasets import load_digits X_np, y_np = load_digits(return_X_y=True) X_df, y_df = pd.DataFrame(X_np), pd.DataFrame(y_np) y_series = pd.Series(y_np) y_2d = y_np.reshape(-1, 1) for X, y in itertools.product([X_np, X_df], [y_np, y_series, y_df, y_2d]): well_formatted_x_y(*format_x_y(X, y), y_type=pd.Series) well_formatted_x_y(*format_x_y(X, y, y_type=pd.DataFrame), y_type=pd.DataFrame)
def test_format_x_y_missing_targets(self): """ Samples with missing labels should be removed from training data. """ def well_formatted_x_y(x, y, y_type): assert isinstance(x, pd.DataFrame) assert isinstance(y, y_type) assert len(x) == len(y) from sklearn.datasets import load_digits x, y = load_digits(return_X_y=True) y = y.astype(float) y[::2] = np.nan x_, y_ = format_x_y(x, y) assert (1797, ) == y.shape assert (898, ) == y_.shape assert np.array_equal(y[1::2], y_) assert np.array_equal(x[1::2, :], x_) well_formatted_x_y(x_, y_, y_type=pd.Series)
def fit( self, x: Union[pd.DataFrame, np.ndarray], y: Union[pd.DataFrame, pd.Series, np.ndarray], warm_start: Optional[List[Individual]] = None, ) -> "Gama": """ Find and fit a model to predict target y from X. Various possible machine learning pipelines will be fit to the (X,y) data. Using Genetic Programming, the pipelines chosen should lead to gradually better models. Pipelines will internally be validated using cross validation. After the search termination condition is met, the best found pipeline configuration is then used to train a final model on all provided data. Parameters ---------- x: pandas.DataFrame or numpy.ndarray, shape = [n_samples, n_features] Training data. All elements must be able to be converted to float. y: pandas.DataFrame, pandas.Series or numpy.ndarray, shape = [n_samples,] Target values. If a DataFrame is provided, assumes the first column contains target values. warm_start: List[Individual], optional (default=None) A list of individual to start the search procedure with. If None is given, random start candidates are generated. """ self._time_manager = TimeKeeper(self._time_manager.total_time) with self._time_manager.start_activity("preprocessing", activity_meta=["default"]): x, self._y = format_x_y(x, y) self._inferred_dtypes = x.dtypes is_classification = hasattr(self, "_label_encoder") self._x, self._basic_encoding_pipeline = basic_encoding( x, is_classification) self._fixed_pipeline_extension = basic_pipeline_extension( self._x, is_classification) self._operator_set._safe_compile = partial( self._operator_set._compile, preprocessing_steps=self._fixed_pipeline_extension, ) store_pipelines = (self._evaluation_library._m is None or self._evaluation_library._m > 0) if store_pipelines and self._x.shape[0] * self._x.shape[ 1] > 6_000_000: # if m > 0, we are storing models for each evaluation. For this size # KNN will create models of about 76Mb in size, which is too big, so # we exclude it from search: log.info( "Excluding KNN from search because the dataset is too big." ) from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor self._pset["prediction"] = [ p for p in self._pset["prediction"] if p.identifier not in [KNeighborsClassifier, KNeighborsRegressor] ] if store_pipelines and self._x.shape[1] > 50: log.info( "Data has too many features to include PolynomialFeatures") from sklearn.preprocessing import PolynomialFeatures self._pset["data"] = [ p for p in self._pset["data"] if p.identifier not in [PolynomialFeatures] ] fit_time = int((1 - self._post_processing.time_fraction) * self._time_manager.total_time_remaining) with self._time_manager.start_activity( "search", time_limit=fit_time, activity_meta=[self._search_method.__class__.__name__], ): self._search_phase(warm_start, timeout=fit_time) with self._time_manager.start_activity( "postprocess", time_limit=int(self._time_manager.total_time_remaining), activity_meta=[self._post_processing.__class__.__name__], ): best_individuals = list( reversed( sorted( self._final_pop, key=lambda ind: cast(Fitness, ind.fitness).values, ))) self._post_processing.dynamic_defaults(self) self.model = self._post_processing.post_process( self._x, self._y, self._time_manager.total_time_remaining, best_individuals, ) if not self._store == "all": to_clean = dict(nothing="all", logs="evaluations", models="logs") self.cleanup(to_clean[self._store]) return self