Exemplo n.º 1
0
def _train_test_split_with_object(X,
                                  y,
                                  shuffle=True,
                                  random_state=None,
                                  stratify=None,
                                  **kwargs):
    """
    Split X and y with y label-encoded first, then decode the split targets.

    The original note on this helper states that cuml.train_test_split raises
    an exception when y.dtype == 'object'; y is therefore encoded before the
    split and decoded afterwards.

    Parameters
    ----------
    X, y : passed to ``train_test_split`` (y in encoded form).
    shuffle, random_state, **kwargs : forwarded unchanged.
    stratify : forwarded; replaced by the encoded y when it is the very same
        object as y, or encoded with a separate encoder when its dtype
        prints as 'object'.

    Returns
    -------
    X_train, X_test, y_train_decoded, y_test_decoded
        The decoded targets carry the index of the matching encoded split.
    """
    target_encoder = LabelEncoder()
    encoded_y = target_encoder.fit_transform(y)

    # Stratification labels must be encoded as well when they are objects.
    if stratify is y:
        stratify = encoded_y
    elif stratify is not None and str(stratify.dtype) == 'object':
        stratify = LabelEncoder().fit_transform(stratify)

    X_train, X_test, y_train, y_test = train_test_split(
        X,
        encoded_y,
        shuffle=shuffle,
        random_state=random_state,
        stratify=stratify,
        **kwargs)

    decoded_targets = []
    for encoded_part in (y_train, y_test):
        restored = target_encoder.inverse_transform(encoded_part)
        # Keep the row labels produced by the split.
        restored.index = encoded_part.index
        decoded_targets.append(restored)
    y_train_decoded, y_test_decoded = decoded_targets

    return X_train, X_test, y_train_decoded, y_test_decoded
Exemplo n.º 2
0
    def fit(self, X, y=None):
        """
        Learn one label encoder per feature of X.

        Parameters
        ----------
        X : cuDF.DataFrame or cupy.ndarray, shape = (n_samples, n_features)
            Data whose per-feature categories are learned.
        y : None
            Unused. Accepted for compatibility only.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If explicit categories are supplied and their number of columns
            differs from ``X.shape[1]``.
        KeyError
            If explicit categories are supplied, ``handle_unknown`` is
            'error' and ``_has_unknown`` reports unknown values in a column
            of X.

        """
        self._validate_keywords()
        X = self._check_input_fit(X)

        def build_encoder():
            # Every feature gets its own encoder, configured identically.
            return LabelEncoder(handle=self.handle,
                                verbose=self.verbose,
                                output_type=self.output_type,
                                handle_unknown=self.handle_unknown)

        if type(self.categories) is str and self.categories == 'auto':
            # Categories are derived from the unique values of each column.
            self._features = X.columns
            fitted_encoders = {}
            for feature in self._features:
                fitted_encoders[feature] = build_encoder().fit(
                    self._unique(X[feature]))
            self._encoders = fitted_encoders
        else:
            # Categories were supplied explicitly; validate them against X.
            self.categories = self._check_input_fit(self.categories, True)
            self._features = self.categories.columns
            if len(self._features) != X.shape[1]:
                raise ValueError("Shape mismatch: if categories is not 'auto',"
                                 " it has to be of shape (n_features, _).")
            self._encoders = dict()
            for feature in self._features:
                self._encoders[feature] = build_encoder().fit(
                    self.categories[feature])

                if (self.handle_unknown == 'error'
                        and self._has_unknown(
                            X[feature], self._encoders[feature].classes_)):
                    template = ("Found unknown categories in column {0}"
                                " during fit")
                    raise KeyError(template.format(feature))

        self.drop_idx_ = self._compute_drop_idx()
        self._fitted = True
        return self
Exemplo n.º 3
0
 def label_encoder(self):
     """
     Label encode every column of output_df that is named in lbl_enc_feats.

     Each listed column is overwritten in output_df with its encoded values;
     a fresh encoder is fitted per column.
     """
     for column_name in self.lbl_enc_feats:
         encoder = LabelEncoder()
         column = self.output_df[column_name]
         encoder.fit(column)
         self.output_df[column_name] = encoder.transform(column)
    def create_features(
        self,
        train_df: pd.DataFrame,
        test_df: pd.DataFrame,
    ):
        """
        Build combined-category features and store them on self.train/self.test.

        Train and test are copied and concatenated; an "index" column is added
        to remember the row order. For r in 2, 3 and 4 a ConcatCombination of
        the ``cat_cols`` columns (resolved from outside this method) is
        computed, every resulting column is label encoded and cast to
        "category", and the results are appended to the combined frame.
        Only columns that are neither original train columns nor "index" are
        kept. The first ``len(train_df)`` rows go to ``self.train`` and the
        rest to ``self.test``.
        """
        with timer("load data"):
            train_part = train_df.copy()
            test_part = test_df.copy()
            n_train = len(train_part)
            original_columns = train_part.columns.tolist()

        with timer("concat train and test"):
            # reset_index() materialises the row order as an "index" column.
            total = cudf.concat(
                [train_part, test_part], ignore_index=True).reset_index()
            del train_part, test_part
            gc.collect()

        with timer("combi cats"):
            combination_frames = []
            for r in [2, 3, 4]:
                combiner = xfeat.ConcatCombination(drop_origin=True, r=r)
                combination_frames.append(
                    combiner.fit_transform(
                        total[cat_cols].astype(str).fillna("none")))
            new_cat_df = cudf.concat(combination_frames, axis="columns")

            for name in new_cat_df.columns:
                encoded = LabelEncoder().fit_transform(new_cat_df[name])
                new_cat_df[name] = encoded.astype("category")

            total = cudf.concat([total, new_cat_df], axis="columns")

        with timer("end"):
            total = total.sort_values("index")
            excluded = original_columns + ["index"]
            new_cols = [
                name for name in total.columns if name not in excluded
            ]

            train_rows = total[new_cols].iloc[:n_train]
            self.train = train_rows.reset_index(drop=True)
            test_rows = total[new_cols].iloc[n_train:]
            self.test = test_rows.reset_index(drop=True)
Exemplo n.º 5
0
    def create_features(
        self,
        train_df: cudf.DataFrame,
        test_df: cudf.DataFrame,
    ):
        """
        Build the basic feature table and store it on self.train/self.test.

        Train and test are copied and concatenated, then:

        * "Rating" values are replaced through a fixed mapping and cast to int;
        * "Name", "Platform", "Genre", "Publisher" and "Developer" are label
          encoded and cast to "category";
        * "tbd" in "User_Score" becomes NaN and the column is cast to float;
        * 2020.0 in "Year_of_Release" is replaced by 2017.0;
        * "log_User_Count" is log1p of "User_Count".

        ``self.train`` receives the first ``len(train_df)`` rows with the
        basic and target columns; ``self.test`` receives the remaining rows
        with the basic columns only.
        """
        with timer("load data"):
            train_part = train_df.copy()
            test_part = test_df.copy()
            n_train = len(train_part)

        with timer("concat train and test"):
            total = cudf.concat([train_part, test_part], ignore_index=True)

        with timer("label encoding"):
            with timer("rating"):
                rating_codes = {
                    "RP": 0,
                    "EC": 1,
                    "K-A": 2,
                    "E": 2,
                    "E10+": 3,
                    "T": 4,
                    "M": 5,
                    "AO": 5,
                }
                mapped_rating = total["Rating"].replace(rating_codes)
                total["Rating"] = mapped_rating.astype(int)

            with timer("other cat cols"):
                categorical_names = [
                    "Name",
                    "Platform",
                    "Genre",
                    "Publisher",
                    "Developer",
                ]
                for name in categorical_names:
                    encoder = LabelEncoder(handle_unknown="ignore")
                    encoder.fit(total[name])
                    codes = encoder.transform(total[name])
                    total[name] = codes.astype("category")

        with timer("User_Score"):
            cleaned_score = total["User_Score"].replace(
                to_replace="tbd", value=np.nan)
            total["User_Score"] = cleaned_score.astype(float)

        with timer("Year_of_Release"):
            total["Year_of_Release"] = total["Year_of_Release"].replace(
                to_replace=2020.0, value=2017.0)

        with timer("log_User_Count"):
            user_count = total["User_Count"].to_pandas()
            total["log_User_Count"] = np.log1p(user_count)

        with timer("end"):
            target_cols = [
                "NA_Sales",
                "EU_Sales",
                "JP_Sales",
                "Other_Sales",
                "Global_Sales",
            ]
            basic_cols = [
                "Name",
                "Platform",
                "Year_of_Release",
                "Genre",
                "Publisher",
                "Critic_Score",
                "Critic_Count",
                "User_Score",
                "User_Count",
                "log_User_Count",
                "Developer",
                "Rating",
            ]
            # Targets are kept for the train rows only.
            train_rows = total[basic_cols + target_cols].iloc[:n_train]
            self.train = train_rows.reset_index(drop=True)
            test_rows = total[basic_cols].iloc[n_train:]
            self.test = test_rows.reset_index(drop=True)