def test_X_y_from_arff():
    """Check that an ARFF file is split into a feature frame and a target series."""
    # Fixture corresponds to https://www.openml.org/d/23380
    features, target = X_y_from_arff(
        "tests/data/openml_d_23380.arff", split_column="TR"
    )

    # Feature side: a DataFrame with the expected size, missing values and dtypes.
    assert isinstance(features, pd.DataFrame)
    assert features.shape == (2796, 34)
    assert features.isnull().sum().sum() == 68100

    numeric_columns = sum(dtype in NUMERIC_TYPES for dtype in features.dtypes)
    assert numeric_columns == 32
    categorical_columns = sum(dtype.name == "category" for dtype in features.dtypes)
    assert categorical_columns == 2

    # Target side: a fully populated categorical Series with six classes.
    assert isinstance(target, pd.Series)
    assert target.shape == (2796,)
    assert target.isnull().sum() == 0
    assert len(target.dtype.categories) == 6
def predict_proba_arff(
    self,
    arff_file_path: str,
    target_column: Optional[str] = None,
    encoding: Optional[str] = None,
) -> np.ndarray:
    """ Predict the class probabilities for input in the arff_file.

    Parameters
    ----------
    arff_file_path: str
        An ARFF file with the same columns as the one used in fit.
        Target column must be present in file, but its values are ignored.
    target_column: str, optional (default=None)
        Specifies which column the model should predict.
        If left None, the last column is taken to be the target.
    encoding: str, optional
        Encoding of the ARFF file.

    Returns
    -------
    numpy.ndarray
        Numpy array with class probabilities.
        The array is of shape (N, K) where N is len(X),
        and K is the number of class labels found in `y` of `fit`.
    """
    # Pass the options by keyword, like the other *_arff wrappers do, so the
    # call does not depend on the positional order of the loader's parameters.
    x, _ = X_y_from_arff(
        arff_file_path, split_column=target_column, encoding=encoding
    )
    # The target values read from the file are deliberately discarded.
    x = self._prepare_for_prediction(x)
    return self._predict_proba(x)
def score_arff(
    self,
    arff_file_path: str,
    target_column: Optional[str] = None,
    encoding: Optional[str] = None,
) -> float:
    """ Evaluate the model on an ARFF file using the `self.scoring` metric.

    Parameters
    ----------
    arff_file_path: str
        Path to the ARFF file holding the data to score on.
    target_column: str, optional (default=None)
        Name of the column the model should predict.
        When None, the last column is used as the target.
    encoding: str, optional
        Encoding of the ARFF file.

    Returns
    -------
    float
        The score on the given data according to the `scoring` metric.
    """
    features, target = X_y_from_arff(
        arff_file_path,
        split_column=target_column,
        encoding=encoding,
    )
    # Scoring itself is delegated to `score`.
    return self.score(features, target)
def fit_arff(
    self,
    arff_file_path: str,
    target_column: Optional[str] = None,
    encoding: Optional[str] = None,
    *args,
    **kwargs,
) -> None:
    """ Load training data from an ARFF file and fit a model on it.

    Parameters
    ----------
    arff_file_path: str
        Path to an ARFF file containing the training data.
    target_column: str, optional (default=None)
        Name of the column the model should predict.
        When None, the last column is used as the target.
    encoding: str, optional
        Encoding of the ARFF file.
    *args, **kwargs
        Forwarded unchanged to `fit`.
    """
    features, target = X_y_from_arff(
        arff_file_path,
        split_column=target_column,
        encoding=encoding,
    )
    # Any extra arguments are handed straight through to `fit`.
    self.fit(features, target, *args, **kwargs)
def predict_arff(
    self,
    arff_file_path: str,
    target_column: Optional[str] = None,
    encoding: Optional[str] = None,
) -> np.ndarray:
    """ Predict the target for every row in an ARFF file.

    Parameters
    ----------
    arff_file_path: str
        An ARFF file with the same columns as the one used in fit.
        The target column must be present, but its values are ignored.
    target_column: str, optional (default=None)
        Name of the column the model should predict.
        When None, the last column is used as the target.
    encoding: str, optional
        Encoding of the ARFF file.

    Returns
    -------
    numpy.ndarray
        Array with predictions for each row in the ARFF file.
    """
    # Only the features are needed; the target read from the file is dropped.
    features, _ = X_y_from_arff(
        arff_file_path,
        split_column=target_column,
        encoding=encoding,
    )
    prepared = self._prepare_for_prediction(features)
    return self._predict(prepared)