예제 #1
0
    def _save_data(self, X_train, y_train, X_validation=None, y_validation=None):
        """Persist the training data and a JSON summary of it in the results directory.

        Writes ``X_train`` and ``y_train`` as parquet files under
        ``self._results_path``, records those locations in ``self._validation``,
        stores a summary in ``self._data_info`` (also written to
        ``data_info.json``) and finally passes ``X_train`` to
        ``self._drop_data_variables``.

        Args:
            X_train: Feature table; ``to_parquet``, ``columns`` and ``shape`` are
                used on it (presumably a pandas DataFrame - confirm with callers).
            y_train: Target values; cast to ``str`` for multiclass tasks
                (presumably a pandas Series - confirm with callers).
            X_validation: Accepted for interface compatibility; not used here.
            y_validation: Accepted for interface compatibility; not used here.
        """
        self._X_train_path = os.path.join(self._results_path, "X_train.parquet")
        self._y_train_path = os.path.join(self._results_path, "y_train.parquet")

        X_train.to_parquet(self._X_train_path, index=False)

        # Capture the target dtype BEFORE the multiclass string conversion below;
        # checking afterwards would report every multiclass target as non-numeric.
        target_is_numeric = pd.api.types.is_numeric_dtype(y_train)
        if self._ml_task == MULTICLASS_CLASSIFICATION:
            y_train = y_train.astype(str)

        pd.DataFrame({"target": y_train}).to_parquet(self._y_train_path, index=False)

        self._validation["X_train_path"] = self._X_train_path
        self._validation["y_train_path"] = self._y_train_path
        self._validation["results_path"] = self._results_path

        columns_and_target_info = DataInfo.compute(X_train, y_train, self._ml_task)

        self._data_info = {
            "columns": X_train.columns.tolist(),
            "rows": X_train.shape[0],
            "cols": X_train.shape[1],
            "target_is_numeric": target_is_numeric,
            "columns_info": columns_and_target_info["columns_info"],
            "target_info": columns_and_target_info["target_info"],
        }
        # "num_class" is only stored when DataInfo.compute reported a value for it.
        if columns_and_target_info.get("num_class") is not None:
            self._data_info["num_class"] = columns_and_target_info["num_class"]
        data_info_path = os.path.join(self._results_path, "data_info.json")
        with open(data_info_path, "w") as fout:
            fout.write(json.dumps(self._data_info, indent=4))

        self._drop_data_variables(X_train)
예제 #2
0
    def _save_data(self, X, y):
        """Store the dataset as parquet files and write its summary to ``data_info.json``.

        ``X`` and ``y`` are written under ``self._results_path``; the file
        locations are recorded in ``self._validation_strategy``. The method also
        sets ``self.n_features_in_``, ``self.n_classes`` and ``self._data_info``,
        and ends by passing ``X`` to ``self._drop_data_variables``.

        Args:
            X: Feature table; ``to_parquet``, ``columns`` and ``shape`` are used
                on it (presumably a pandas DataFrame - confirm with callers).
            y: Target values; cast to ``str`` for multiclass tasks
                (presumably a pandas Series - confirm with callers).
        """
        results_dir = self._results_path
        self._X_path = os.path.join(results_dir, "X.parquet")
        self._y_path = os.path.join(results_dir, "y.parquet")

        X.to_parquet(self._X_path, index=False)

        # The dtype check must see the target before it may be cast to strings.
        numeric_target = pd.api.types.is_numeric_dtype(y)
        if self._ml_task == MULTICLASS_CLASSIFICATION:
            y = y.astype(str)

        target_frame = pd.DataFrame({"target": y})
        target_frame.to_parquet(self._y_path, index=False)

        for key, location in (
            ("X_path", self._X_path),
            ("y_path", self._y_path),
            ("results_path", results_dir),
        ):
            self._validation_strategy[key] = location

        summary = DataInfo.compute(X, y, self._ml_task)

        self.n_features_in_ = X.shape[1]
        # Count distinct target values, leaving missing entries out.
        missing_mask = pd.isnull(y)
        distinct_values = np.unique(y[~missing_mask])
        self.n_classes = len(distinct_values)

        info = {
            "columns": X.columns.tolist(),
            "rows": y.shape[0],
            "cols": X.shape[1],
            "target_is_numeric": numeric_target,
            "columns_info": summary["columns_info"],
            "target_info": summary["target_info"],
            "n_features": self.n_features_in_,
        }
        # The class count is only recorded for non-regression tasks.
        if self._ml_task != REGRESSION:
            info["n_classes"] = self.n_classes

        num_class = summary.get("num_class")
        if num_class is not None:
            info["num_class"] = num_class
        self._data_info = info

        with open(os.path.join(results_dir, "data_info.json"), "w") as fout:
            fout.write(json.dumps(self._data_info, indent=4))

        self._drop_data_variables(X)