def test_get_type_pandas(self):
    d = {"col1": [1, 2, 3], "col2": ["a", "b", "c"]}
    df = pd.DataFrame(data=d)
    col1_type = PreprocessingUtils.get_type(df["col1"])
    self.assertNotEqual(col1_type, PreprocessingUtils.CATEGORICAL)
    col2_type = PreprocessingUtils.get_type(df["col2"])
    self.assertEqual(col2_type, PreprocessingUtils.CATEGORICAL)
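# For context, a minimal sketch of the kind of dtype dispatch that
# PreprocessingUtils.get_type performs. The body below is an illustrative
# assumption, not the library's actual implementation; the returned strings
# mirror the type names used elsewhere in this listing.
import pandas as pd


def get_type_sketch(x):
    s = pd.Series(x)
    if pd.api.types.is_datetime64_any_dtype(s):
        return "datetime"
    if pd.api.types.is_numeric_dtype(s):
        return "continous"  # (sic) spelling used throughout this code
    return "categorical"


# get_type_sketch([1, 2, 3])       -> "continous"
# get_type_sketch(["a", "b", "c"]) -> "categorical"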
    def _fit_na_fill(self, X):
        for column in self._columns:
            # skip columns without missing values
            if not pd.isnull(X[column]).any():
                continue
            self._na_fill_params[column] = self._get_fill_value(X[column])
            if PreprocessingUtils.get_type(X[column]) == PreprocessingUtils.DATETIME:
                self._datetime_columns += [column]
    def _get_fill_value(self, x):
        # categorical type
        if PreprocessingUtils.get_type(x) == PreprocessingUtils.CATEGORICAL:
            if self._na_fill_method == PreprocessingMissingValues.FILL_NA_MIN:
                return (
                    PreprocessingMissingValues.MISSING_VALUE
                )  # add new categorical value
            return PreprocessingUtils.get_most_frequent(x)

        if PreprocessingUtils.get_type(x) == PreprocessingUtils.DATETIME:
            return PreprocessingUtils.get_most_frequent(x)

        # numerical type
        if self._na_fill_method == PreprocessingMissingValues.FILL_NA_MIN:
            return PreprocessingUtils.get_min(x) - 1.0
        if self._na_fill_method == PreprocessingMissingValues.FILL_NA_MEAN:
            return PreprocessingUtils.get_mean(x)
        return PreprocessingUtils.get_median(x)
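# A minimal sketch of how fill values fitted by _fit_na_fill would typically be
# applied at transform time. This standalone demo is an assumption about usage,
# not the library's API: it mimics what gets stored in self._na_fill_params and
# then fills each column with its remembered value.
import pandas as pd

X = pd.DataFrame({"age": [10.0, None, 30.0], "city": ["NY", None, "NY"]})
na_fill_params = {"age": X["age"].median(), "city": X["city"].mode()[0]}
for column, value in na_fill_params.items():
    X[column] = X[column].fillna(value)
# age -> [10.0, 20.0, 30.0], city -> ["NY", "NY", "NY"]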
    def _fit_categorical_convert(self, X):
        for column in self._columns:
            if PreprocessingUtils.get_type(X[column]) != PreprocessingUtils.CATEGORICAL:
                # no need to convert, already a number
                continue
            # Limit the number of categories - needed for one-hot encoding.
            # This code is also used in predict.py and transform_utils.py.
            # TODO: needs refactoring!
            too_many_categories = len(np.unique(list(X[column].values))) > 200
            if (
                self._convert_method == PreprocessingCategorical.CONVERT_ONE_HOT
                and not too_many_categories
            ):
                # the project's LabelBinarizer takes the frame and the column name
                lbl = LabelBinarizer()
                lbl.fit(X, column)
            else:
                lbl = LabelEncoder()
                lbl.fit(X[column])

            self._convert_params[column] = lbl.to_json()
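# For reference, a self-contained demo of the two conversion strategies chosen
# above. The LabelBinarizer/LabelEncoder used in the snippet appear to be
# project-specific wrappers (note the to_json() call); the scikit-learn
# equivalents below show the underlying transformations.
from sklearn.preprocessing import LabelBinarizer, LabelEncoder

values = ["red", "green", "blue", "green"]

# One-hot encoding: one indicator column per category (classes sorted).
print(LabelBinarizer().fit_transform(values))
# [[0 0 1]
#  [0 1 0]
#  [1 0 0]
#  [0 1 0]]

# Label encoding: a single integer code per category.
print(LabelEncoder().fit_transform(values))
# [2 1 0 1]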
    @staticmethod
    def compute(X_train, y_train, eda_path):

        try:
            if os.path.exists(eda_path):
                # the EDA analysis was probably already done - skip
                return
            # create the directory for the EDA artifacts
            os.mkdir(eda_path)

            inform = defaultdict(list)

            if isinstance(y_train, pd.Series):

                if PreprocessingUtils.get_type(y_train) == "categorical":

                    plt.figure(figsize=(5, 5))
                    sns.countplot(y_train, color=BLUE)
                    plt.title("Target class distribution")
                    plt.tight_layout(pad=2.0)
                    plot_path = os.path.join(eda_path, "target.png")
                    plt.savefig(plot_path)
                    plt.close("all")

                else:

                    plt.figure(figsize=(5, 5))
                    sns.distplot(y_train, color=BLUE)
                    plt.title("Target class distribution")
                    plt.tight_layout(pad=2.0)
                    plot_path = os.path.join(eda_path, "target.png")
                    plt.savefig(plot_path)
                    plt.close("all")

                inform["missing"].append(
                    pd.isnull(y_train).sum() / y_train.shape[0])
                inform["unique"].append(y_train.nunique())
                inform["feature_type"].append(
                    PreprocessingUtils.get_type(y_train))
                inform["plot"].append("![](target.png)")
                inform["feature"].append("target")
                inform["desc"].append(y_train.describe().to_dict())

            for col in X_train.columns:

                inform["feature_type"].append(
                    PreprocessingUtils.get_type(X_train[col]))

                if PreprocessingUtils.get_type(X_train[col]) in (
                        "categorical",
                        "discrete",
                ):

                    plt.figure(figsize=(5, 5))
                    chart = sns.countplot(
                        X_train[col],
                        order=X_train[col].value_counts().iloc[:10].index,
                        color=BLUE,
                    )
                    chart.set_xticklabels(chart.get_xticklabels(), rotation=90)
                    plt.title(f"{col} class distribution")
                    plt.tight_layout(pad=2.0)
                    plot_path = os.path.join(eda_path, f"{col}.png")
                    plt.savefig(plot_path)
                    plt.close("all")

                # "continous" (sic) matches the type string used by get_type
                elif PreprocessingUtils.get_type(X_train[col]) == "continous":

                    plt.figure(figsize=(5, 5))
                    sns.distplot(X_train[col], color=BLUE)
                    plt.title(f"{col} value distribution")
                    plt.tight_layout(pad=2.0)
                    plot_path = os.path.join(eda_path, f"{col}.png")
                    plt.savefig(plot_path)
                    plt.close("all")

                elif PreprocessingUtils.get_type(X_train[col]) in ("text"):

                    plt.figure(figsize=(10, 10), dpi=70)
                    word_string = " ".join(X_train[col].str.lower())
                    wordcloud = WordCloud(
                        width=500,
                        height=500,
                        stopwords=STOPWORDS,
                        background_color="white",
                        max_words=400,
                        max_font_size=None,
                    ).generate(word_string)

                    plt.imshow(wordcloud,
                               aspect="auto",
                               interpolation="nearest")
                    plt.axis("off")
                    plot_path = os.path.join(eda_path, f"{col}.png")
                    plt.savefig(plot_path)
                    plt.close("all")

                elif PreprocessingUtils.get_type(X_train[col]) in ("datetime"):

                    plt.figure(figsize=(5, 5))
                    pd.to_datetime(X_train[col]).plot(grid=True, color=BLUE)
                    plt.tight_layout(pad=2.0)
                    plot_path = os.path.join(eda_path, f"{col}.png")
                    plt.savefig(plot_path)
                    plt.close("all")

                inform["missing"].append(
                    pd.isnull(X_train[col]).sum() * 100 / X_train.shape[0])

                inform["unique"].append(int(X_train[col].nunique()))
                inform["plot"].append(f"![]({col}.png)")
                inform["feature"].append(str(col))
                inform["desc"].append(X_train[col].describe().to_dict())

            df = pd.DataFrame(inform)

            with open(os.path.join(eda_path, "README.md"), "w") as fout:

                for i, row in df.iterrows():

                    fout.write(f"## Feature : {row['feature']}\n")
                    fout.write(f"- **Feature type** : {row['feature_type']}\n")
                    fout.write(f"- **Missing** : {row['missing']}%\n")
                    fout.write(f"- **Unique** : {row['unique']}\n")

                    for key in row["desc"].keys():

                        if key in ("25%", "50%", "75%"):

                            fout.write(
                                f"- **{key.capitalize()}th Percentile** : {row['desc'][key]}\n"
                            )
                        else:

                            fout.write(
                                f"- **{key.capitalize()}** :{row['desc'][key]}\n"
                            )

                    fout.write(f"- {row['plot']}\n")

        except Exception as e:
            logger.error(f"There was an issue when running EDA. {str(e)}")
    @staticmethod
    def extensive_eda(X, y, save_path):

        # validate the inputs
        if not isinstance(X, pd.DataFrame):
            raise ValueError("X should be a dataframe")
        if X.shape[0] != len(y):
            raise ValueError("X and y should have the same number of samples")

        if X.shape[1] > MAXCOL:
            X = X.iloc[:, :MAXCOL]
            warnings.warn(
                f"AutoML EDA column limit exceeded, running only on the first {MAXCOL} columns"
            )

        if save_path:
            if not os.path.exists(save_path):
                os.mkdir(save_path)
        else:
            raise ValueError("Please provide a valid path to save the Extensive EDA")

        plt.style.use("ggplot")
        try:

            if PreprocessingUtils.get_type(y) in ("categorical", "discrete"):

                for col in X.columns:

                    if PreprocessingUtils.get_type(X[col]) == "continous":

                        plt.figure(figsize=(5, 5))
                        for i in np.unique(y):
                            sns.kdeplot(
                                x=X.iloc[np.where(y == i)[0]][col],
                                label=f"class {i}",
                                shade=True,
                            )
                        plt.legend()
                        plt.gca().set_title(
                            f"Distribution of {col} for each class",
                            fontsize=11,
                            weight="bold",
                            alpha=0.75,
                        )
                        plt.savefig(EDA.plot_path(save_path, col + "_target"))

                    elif PreprocessingUtils.get_type(X[col]) in (
                        "categorical",
                        "discrete",
                    ):

                        if X[col].nunique() > 7:
                            warnings.warn("Using only the 7 most frequent values")

                        values = X[col].value_counts().index[:7]
                        plt.figure(figsize=(5, 5))
                        sns.countplot(
                            x=X[X[col].isin(values)][col], hue=y[X[col].isin(values)]
                        )
                        plt.gca().set_title(
                            f"Count plot of each {col}",
                            fontsize=11,
                            weight="bold",
                            alpha=0.75,
                        )
                        plt.savefig(EDA.plot_path(save_path, col + "_target"))

            elif PreprocessingUtils.get_type(y) == "continous":
                for col in X.columns:

                    if PreprocessingUtils.get_type(X[col]) == "continous":

                        plt.figure(figsize=(5, 5))
                        plt.scatter(X[col].values, y)
                        plt.gca().set_xlabel(f"{col}")
                        plt.gca().set_ylabel("target")
                        plt.gca().set_title(
                            f"Scatter plot of {col} vs target",
                            fontsize=11,
                            weight="bold",
                            alpha=0.75,
                        )

                        plt.savefig(EDA.plot_path(save_path, col + "_target"))

                    elif PreprocessingUtils.get_type(X[col]) in (
                        "categorical",
                        "discrete",
                    ):
                        if X[col].nunique() > 7:
                            warnings.warn("Using only the 7 most frequent values")

                        plt.figure(figsize=(5, 5))
                        for i in X[col].value_counts().index[:7]:
                            sns.kdeplot(
                                x=y[X[X[col] == i].index],
                                shade=True,
                                label=f"{col}_{i}",
                            )
                        plt.gca().set_title(
                            f"Distribution of target for each {col}",
                            fontsize=11,
                            weight="bold",
                            alpha=0.75,
                        )
                        plt.legend()

                        plt.savefig(EDA.plot_path(save_path, col + "_target"))

                    elif PreprocessingUtils.get_type(X[col]) == "datetime":

                        plt.figure(figsize=(5, 5))
                        plt.plot(X[col], y)
                        plt.gca().set_xticklabels(X[col].dt.date, rotation=45)
                        plt.gca().set_title(
                            f"Distribution of target over time",
                            fontsize=11,
                            weight="bold",
                            alpha=0.75,
                        )
                        plt.savefig(EDA.plot_path(save_path, col + "_target"))

            cols = [
                col
                for col in X.columns
                if PreprocessingUtils.get_type(X[col]) == "continous"
            ][:COLS]

            if len(cols) > 0:

                plt.figure(figsize=(10, 10))
                sns.heatmap(X[cols].corr())
                plt.gca().set_title("Heatmap", fontsize=11, weight="bold", alpha=0.75)
                plt.savefig(os.path.join(save_path, "heatmap.png"))

            with open(os.path.join(save_path, "Extensive_EDA.md"), "w") as fout:

                for col in X.columns:

                    fout.write(f"## Bivariate analysis of {col} feature with target\n")
                    fout.write("\n![]({})\n".format(EDA.plot_fname(col + "_target")))
                    fout.write("\n")
                    fout.write(
                        "------------------------------------------------------\n"
                    )

                if len(cols) > 0:
                    fout.write("## Heatmap\n")
                    fout.write("![](heatmap.png)\n")
                    fout.write("\n")
                    fout.write(
                        "------------------------------------------------------\n"
                    )

        except Exception as e:
            raise AutoMLException(str(e))
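# A corresponding usage sketch for the bivariate report, again assuming the
# method hangs off the EDA class. save_path must be provided, otherwise a
# ValueError is raised; "extensive_eda_report" is an arbitrary directory name.
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
X = pd.DataFrame({
    "x1": rng.normal(size=100),
    "x2": rng.choice(["a", "b", "c"], size=100),
})
y = pd.Series(rng.integers(0, 2, size=100))  # discrete target

# Produces one <col>_target.png per feature, a heatmap over the continuous
# columns, and an Extensive_EDA.md index.
EDA.extensive_eda(X, y, save_path="extensive_eda_report")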
def test_get_type_numpy_number(self):
    tmp = np.array([1, 2, 3])
    tmp_type = PreprocessingUtils.get_type(tmp)
    self.assertNotEqual(tmp_type, PreprocessingUtils.CATEGORICAL)

def test_get_type_numpy_categorical(self):
    tmp = np.array(["a", "b", "c"])
    tmp_type = PreprocessingUtils.get_type(tmp)
    self.assertEqual(tmp_type, PreprocessingUtils.CATEGORICAL)