def _save_data(self, X_train, y_train, X_validation=None, y_validation=None):
    """Write the training data to disk and record a summary of it.

    Saves ``X_train`` and ``y_train`` as parquet files inside
    ``self._results_path``, stores the resulting paths in
    ``self._validation``, builds ``self._data_info`` and dumps it to
    ``data_info.json`` in the same directory, then hands ``X_train`` to
    ``self._drop_data_variables``.

    Args:
        X_train: Feature table; must provide ``to_parquet``, ``columns``
            and ``shape``.
        y_train: Target values; cast to ``str`` for multiclass
            classification before being written.
        X_validation: Accepted for interface compatibility; not used here.
        y_validation: Accepted for interface compatibility; not used here.
    """
    self._X_train_path = os.path.join(self._results_path, "X_train.parquet")
    self._y_train_path = os.path.join(self._results_path, "y_train.parquet")

    X_train.to_parquet(self._X_train_path, index=False)

    # Inspect the target dtype BEFORE the multiclass string cast below;
    # checking afterwards would always report a non-numeric target.
    target_is_numeric = pd.api.types.is_numeric_dtype(y_train)

    if self._ml_task == MULTICLASS_CLASSIFICATION:
        y_train = y_train.astype(str)

    pd.DataFrame({"target": y_train}).to_parquet(self._y_train_path, index=False)

    self._validation["X_train_path"] = self._X_train_path
    self._validation["y_train_path"] = self._y_train_path
    self._validation["results_path"] = self._results_path

    columns_and_target_info = DataInfo.compute(X_train, y_train, self._ml_task)

    self._data_info = {
        "columns": X_train.columns.tolist(),
        "rows": X_train.shape[0],
        "cols": X_train.shape[1],
        "target_is_numeric": target_is_numeric,
        "columns_info": columns_and_target_info["columns_info"],
        "target_info": columns_and_target_info["target_info"],
    }
    # "num_class" is optional in the computed info; copy it only when set.
    num_class = columns_and_target_info.get("num_class")
    if num_class is not None:
        self._data_info["num_class"] = num_class

    data_info_path = os.path.join(self._results_path, "data_info.json")
    with open(data_info_path, "w") as fout:
        json.dump(self._data_info, fout, indent=4)

    self._drop_data_variables(X_train)
def _save_data(self, X, y):
    """Store the input data on disk and summarise it in ``data_info.json``.

    ``X`` and ``y`` are written as parquet files under
    ``self._results_path`` and their locations are registered in
    ``self._validation_strategy``. The method also sets
    ``self.n_features_in_``, ``self.n_classes`` and ``self._data_info``,
    serialises the latter to JSON, and finally calls
    ``self._drop_data_variables(X)``.

    Args:
        X: Feature table; must provide ``to_parquet``, ``columns`` and
            ``shape``.
        y: Target values; cast to ``str`` for multiclass classification
            before being written.
    """
    results_dir = self._results_path
    self._X_path = os.path.join(results_dir, "X.parquet")
    self._y_path = os.path.join(results_dir, "y.parquet")

    X.to_parquet(self._X_path, index=False)

    # Remember whether the target was numeric before any conversion.
    numeric_target = pd.api.types.is_numeric_dtype(y)
    if self._ml_task == MULTICLASS_CLASSIFICATION:
        y = y.astype(str)

    target_frame = pd.DataFrame({"target": y})
    target_frame.to_parquet(self._y_path, index=False)

    for key, location in (
        ("X_path", self._X_path),
        ("y_path", self._y_path),
        ("results_path", results_dir),
    ):
        self._validation_strategy[key] = location

    computed_info = DataInfo.compute(X, y, self._ml_task)

    self.n_features_in_ = X.shape[1]
    # Count distinct target values among the non-null entries.
    non_null_target = y[~pd.isnull(y)]
    self.n_classes = len(np.unique(non_null_target))

    summary = {
        "columns": X.columns.tolist(),
        "rows": y.shape[0],
        "cols": X.shape[1],
        "target_is_numeric": numeric_target,
        "columns_info": computed_info["columns_info"],
        "target_info": computed_info["target_info"],
        "n_features": self.n_features_in_,
    }
    self._data_info = summary

    # The class count is recorded for every task except regression.
    if self._ml_task != REGRESSION:
        summary["n_classes"] = self.n_classes

    num_class = computed_info.get("num_class")
    if num_class is not None:
        summary["num_class"] = num_class

    serialized = json.dumps(self._data_info, indent=4)
    with open(os.path.join(results_dir, "data_info.json"), "w") as fout:
        fout.write(serialized)

    self._drop_data_variables(X)