Exemplo n.º 1
0
    def __init__(
        self,
        train_df,
        categorical_input: List,
        numerical_input: List,
        target: str,
        valid_df=None,
        test_df=None,
        batch_size=2,
        num_workers: Optional[int] = None,
    ):
        """Build train/valid/test datasets from data frames and initialise the parent.

        Args:
            train_df: Training data frame; always required.
            categorical_input: Names of the categorical input columns.
            numerical_input: Names of the numerical input columns.
            target: Name of the target column.
            valid_df: Optional validation data frame.
            test_df: Optional test data frame. A copy without the ``target``
                column is kept on ``self._test_df`` for the predict function.
            batch_size: Forwarded to the parent constructor.
            num_workers: Forwarded to the parent constructor.
        """
        dfs = [train_df]
        self._test_df = None

        if valid_df is not None:
            dfs.append(valid_df)

        if test_df is not None:
            # Save a target-free copy for the predict function. ``drop`` returns
            # a new frame rather than mutating, so its result must be assigned
            # (the previous code discarded it and kept the target column).
            self._test_df = test_df.drop(target, axis=1)
            dfs.append(test_df)

        # impute missing values
        dfs = _impute(dfs, numerical_input)

        # normalization statistics come from the (imputed) training frame only
        self.mean, self.std = _compute_normalization(dfs[0], numerical_input)

        if dfs[0][target].dtype == object:
            # if the target is a category, not an int
            self.target_codes = _generate_codes(dfs, [target])
        else:
            self.target_codes = None

        self.codes = _generate_codes(dfs, categorical_input)

        # apply the pre-transform helper using the statistics and codes above
        dfs = _pre_transform(dfs, numerical_input, categorical_input,
                             self.codes, self.mean, self.std, target,
                             self.target_codes)

        # remember the column layout
        self.cat_cols = categorical_input
        self.num_cols = numerical_input

        # number of distinct target values in the training frame as passed in
        self._num_classes = len(train_df[target].unique())

        train_ds = PandasDataset(dfs[0], categorical_input, numerical_input,
                                 target)
        # dfs[1] is the validation frame whenever one was supplied
        valid_ds = PandasDataset(dfs[1], categorical_input, numerical_input,
                                 target) if valid_df is not None else None
        # the test frame, when supplied, is always appended last
        test_ds = PandasDataset(dfs[-1], categorical_input, numerical_input,
                                target) if test_df is not None else None
        super().__init__(train_ds,
                         valid_ds,
                         test_ds,
                         batch_size=batch_size,
                         num_workers=num_workers)
Exemplo n.º 2
0
    def from_data(
        cls,
        train_df: DataFrame,
        val_df: Optional[DataFrame],
        test_df: Optional[DataFrame],
        predict_df: Optional[DataFrame],
        target_col: str,
        num_cols: List[str],
        cat_cols: List[str],
        is_regression: bool,
    ) -> 'TabularPreprocess':
        """Create an instance whose preprocessing state is derived from data frames.

        Normalization statistics, the class list and the target dtype check
        use the training frame only; the code tables are generated from every
        frame that was supplied.

        Args:
            train_df: Training data frame; required.
            val_df: Optional validation data frame.
            test_df: Optional test data frame.
            predict_df: Optional prediction data frame.
            target_col: Name of the target column.
            num_cols: Names of the numerical columns.
            cat_cols: Names of the categorical columns.
            is_regression: Forwarded unchanged to the constructor.

        Raises:
            MisconfigurationException: If ``train_df`` is ``None``.
        """
        if train_df is None:
            raise MisconfigurationException("train_df is required to instantiate the TabularPreprocess")

        # Training frame first, then whichever optional frames are present,
        # in val / test / predict order.
        frames = [train_df]
        frames.extend(
            frame for frame in (val_df, test_df, predict_df) if frame is not None
        )

        mean, std = _compute_normalization(train_df, num_cols)

        train_target = train_df[target_col]
        classes = list(train_target.unique())
        num_classes = len(classes)

        # Only an object-dtype (non-integer category) target needs a code table.
        target_codes = (
            _generate_codes(frames, [target_col])
            if train_target.dtype == object
            else None
        )
        codes = _generate_codes(frames, cat_cols)

        return cls(
            cat_cols, num_cols, target_col,
            mean, std,
            codes, target_codes,
            classes, num_classes,
            is_regression,
        )
Exemplo n.º 3
0
    def generate_state(train_df: DataFrame,
                       val_df: Optional[DataFrame],
                       test_df: Optional[DataFrame],
                       predict_df: Optional[DataFrame],
                       target_col: str,
                       num_cols: List[str],
                       cat_cols: List[str],
                       is_regression: bool,
                       preprocess_state: Optional[TabularState] = None):
        """Return the preprocess state, computing it from the data frames if needed.

        A supplied ``preprocess_state`` is returned untouched. Otherwise the
        normalization statistics and class count come from the training frame,
        and the code tables are generated from every supplied frame.

        Args:
            train_df: Training data frame; required when no state is supplied.
            val_df: Optional validation data frame.
            test_df: Optional test data frame.
            predict_df: Optional prediction data frame.
            target_col: Name of the target column.
            num_cols: Names of the numerical columns.
            cat_cols: Names of the categorical columns.
            is_regression: Stored unchanged in the resulting state.
            preprocess_state: Previously computed state to reuse, if any.

        Raises:
            MisconfigurationException: If no state is supplied and
                ``train_df`` is ``None``.
        """
        # An existing state short-circuits everything else.
        if preprocess_state is not None:
            return preprocess_state

        if train_df is None:
            raise MisconfigurationException(
                "train_df is required to compute the preprocess state")

        # Training frame first, followed by the optional frames that exist.
        extras = [df for df in (val_df, test_df, predict_df) if df is not None]
        all_frames = [train_df] + extras

        mean, std = _compute_normalization(train_df, num_cols)

        target_values = train_df[target_col]
        num_classes = len(target_values.unique())

        # Only an object-dtype (non-integer category) target gets a code table.
        target_codes = None
        if target_values.dtype == object:
            target_codes = _generate_codes(all_frames, [target_col])

        codes = _generate_codes(all_frames, cat_cols)

        return TabularState(
            cat_cols, num_cols, target_col,
            mean, std,
            codes, target_codes,
            num_classes, is_regression,
        )
Exemplo n.º 4
0
    def compute_state(
        cls,
        train_data_frame: DataFrame,
        val_data_frame: Optional[DataFrame],
        test_data_frame: Optional[DataFrame],
        predict_data_frame: Optional[DataFrame],
        target_col: str,
        num_cols: List[str],
        cat_cols: List[str],
    ) -> Tuple[float, float, List[str], Dict[str, Any], Dict[str, Any]]:
        """Compute the preprocessing state from the supplied data frames.

        Normalization statistics and the class list come from the training
        frame only; the code tables are generated from every supplied frame.

        Args:
            train_data_frame: Training data frame; required.
            val_data_frame: Optional validation data frame.
            test_data_frame: Optional test data frame.
            predict_data_frame: Optional prediction data frame.
            target_col: Name of the target column.
            num_cols: Names of the numerical columns.
            cat_cols: Names of the categorical columns.

        Returns:
            ``(mean, std, classes, codes, target_codes)``. ``target_codes`` is
            ``None`` when the training target column is not of object dtype.

        Raises:
            MisconfigurationException: If ``train_data_frame`` is ``None``.
        """
        if train_data_frame is None:
            raise MisconfigurationException(
                "train_data_frame is required to instantiate the TabularDataFrameDataSource"
            )

        # Training frame first, then the optional frames that were provided,
        # in val / test / predict order.
        optional_frames = (val_data_frame, test_data_frame, predict_data_frame)
        frames = [train_data_frame]
        for frame in optional_frames:
            if frame is not None:
                frames.append(frame)

        mean, std = _compute_normalization(train_data_frame, num_cols)

        train_target = train_data_frame[target_col]
        classes = list(train_target.unique())

        # Only an object-dtype (non-integer category) target gets a code table.
        needs_target_codes = train_target.dtype == object
        target_codes = _generate_codes(frames, [target_col]) if needs_target_codes else None
        codes = _generate_codes(frames, cat_cols)

        return mean, std, classes, codes, target_codes