def test_get_type_pandas(self):
    d = {"col1": [1, 2, 3], "col2": ["a", "b", "c"]}
    df = pd.DataFrame(data=d)
    col1_type = PreprocessingUtils.get_type(df["col1"])
    self.assertNotEqual(col1_type, PreprocessingUtils.CATEGORICAL)
    col2_type = PreprocessingUtils.get_type(df["col2"])
    self.assertEqual(col2_type, PreprocessingUtils.CATEGORICAL)
def _get_fill_value(self, x):
    # categorical type
    if PreprocessingUtils.get_type(x) == PreprocessingUtils.CATEGORICAL:
        if self._na_fill_method == PreprocessingMissingValues.FILL_NA_MIN:
            # add a new categorical value to mark missing entries
            return PreprocessingMissingValues.MISSING_VALUE
        return PreprocessingUtils.get_most_frequent(x)
    # numerical type
    if self._na_fill_method == PreprocessingMissingValues.FILL_NA_MIN:
        return PreprocessingUtils.get_min(x) - 1.0
    if self._na_fill_method == PreprocessingMissingValues.FILL_NA_MEAN:
        return PreprocessingUtils.get_mean(x)
    return PreprocessingUtils.get_median(x)
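# A minimal sanity check of the numeric branches above (toy values; assumes
# numpy and pandas imported as np and pd, and that PreprocessingUtils wraps
# pandas' NaN-aware statistics):
x = pd.Series([3.0, np.nan, 5.0, 7.0])
assert x.min() - 1.0 == 2.0   # FILL_NA_MIN: one below the observed minimum
assert x.mean() == 5.0        # FILL_NA_MEAN: NaN is skipped by pandas
assert x.median() == 5.0      # default: NaN-aware median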
def compute(X, y, machinelearning_task):
    columns_info = {}
    for col in X.columns:
        columns_info[col] = []
        # drop columns that contain only missing values
        empty_column = np.sum(pd.isnull(X[col]) == True) == X.shape[0]
        if empty_column:
            columns_info[col] += ["empty_column"]
            continue
        # drop columns with a single unique (non-null) value
        constant_column = len(np.unique(X.loc[~pd.isnull(X[col]), col])) == 1
        if constant_column:
            columns_info[col] += ["constant_column"]
            continue
        # missing values
        if PreprocessingUtils.is_na(X[col]):
            columns_info[col] += ["missing_values"]
        # column type
        if PreprocessingUtils.is_categorical(X[col]):
            columns_info[col] += ["categorical"]
            columns_info[col] += [EncodingSelector.get(X, y, col)]
        elif PreprocessingUtils.is_datetime(X[col]):
            columns_info[col] += ["datetime_transform"]
        elif PreprocessingUtils.is_text(X[col]):
            columns_info[col] = ["text_transform"]  # override other transforms
        else:
            # numeric type, check if scaling is needed
            if PreprocessingUtils.is_scale_needed(X[col]):
                columns_info[col] += ["scale"]

    target_info = []
    if machinelearning_task == BINARY_CLASSIFICATION:
        if not PreprocessingUtils.is_0_1(y):
            target_info += ["convert_0_1"]
    if machinelearning_task == REGRESSION:
        if PreprocessingUtils.is_log_scale_needed(y):
            target_info += ["scale_log"]
        elif PreprocessingUtils.is_scale_needed(y):
            target_info += ["scale"]

    num_class = None
    if machinelearning_task == MULTICLASS_CLASSIFICATION:
        num_class = PreprocessingUtils.num_class(y)

    return {
        "columns_info": columns_info,
        "target_info": target_info,
        "num_class": num_class,
    }
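# A minimal usage sketch of compute() above (toy data; assumes it is
# reachable as a plain function and that the constants/imports above are
# available - the exact entries depend on the PreprocessingUtils and
# EncodingSelector heuristics):
X = pd.DataFrame({
    "age": [22.0, 35.0, np.nan, 41.0],   # numeric, has a missing value
    "city": ["NY", "LA", "NY", "SF"],    # categorical
})
y = pd.Series([0, 1, 0, 1])
info = compute(X, y, BINARY_CLASSIFICATION)
# info["columns_info"] -> e.g. {"age": ["missing_values", "scale"],
#                               "city": ["categorical", <encoding>]}
# info["target_info"]  -> [] (y is already 0/1)
# info["num_class"]    -> None (set only for multiclass tasks)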
def fit(self, X, y, X_validation=None, y_validation=None, log_to_file=None):
    if self.cat_features is None:
        self.cat_features = []
        for i in range(X.shape[1]):
            if PreprocessingUtils.is_categorical(X.iloc[:, i]):
                self.cat_features += [i]

    eval_set = None
    if X_validation is not None and y_validation is not None:
        eval_set = (X_validation, y_validation)

    self.model.fit(
        X,
        y,
        cat_features=self.cat_features,
        init_model=None if self.model.tree_count_ is None else self.model,
        eval_set=eval_set,
        early_stopping_rounds=self.early_stopping_rounds,
        verbose_eval=False,
    )

    if log_to_file is not None:
        metric_name = list(self.model.evals_result_["learn"].keys())[0]
        result = pd.DataFrame(
            {
                "iteration": range(len(self.model.evals_result_["learn"][metric_name])),
                "train": self.model.evals_result_["learn"][metric_name],
                "validation": self.model.evals_result_["validation"][metric_name],
            }
        )
        result.to_csv(log_to_file, index=False, header=False)
def _fit_na_fill(self, X):
    for column in self._columns:
        if np.sum(pd.isnull(X[column]) == True) == 0:
            continue
        self._na_fill_params[column] = self._get_fill_value(X[column])
        if PreprocessingUtils.get_type(X[column]) == PreprocessingUtils.DATETIME:
            self._datetime_columns += [column]
def test_get_stats(self):
    tmp = np.array([1, np.nan, 2, 3, np.nan, np.nan])
    self.assertEqual(1, PreprocessingUtils.get_min(tmp))
    self.assertEqual(2, PreprocessingUtils.get_mean(tmp))
    self.assertEqual(2, PreprocessingUtils.get_median(tmp))

    d = {"col1": [1, 2, 1, 3, 1, np.nan], "col2": ["a", np.nan, "b", "a", "c", "a"]}
    df = pd.DataFrame(data=d)
    self.assertEqual(1, PreprocessingUtils.get_min(df["col1"]))
    self.assertEqual(8.0 / 5.0, PreprocessingUtils.get_mean(df["col1"]))
    self.assertEqual(1, PreprocessingUtils.get_median(df["col1"]))
    self.assertEqual(1, PreprocessingUtils.get_most_frequent(df["col1"]))
    self.assertEqual("a", PreprocessingUtils.get_most_frequent(df["col2"]))
def fit(self, X, y):
    if self.cat_features is None:
        self.cat_features = []
        for i in range(X.shape[1]):
            if PreprocessingUtils.is_categorical(X.iloc[:, i]):
                self.cat_features += [i]

    self.model.fit(
        X,
        y,
        cat_features=self.cat_features,
        init_model=None if self.model.tree_count_ is None else self.model,
    )
def _fit_categorical_convert(self, X):
    for column in self._columns:
        if PreprocessingUtils.get_type(X[column]) != PreprocessingUtils.CATEGORICAL:
            # no need to convert, already a number
            continue
        # limit categories - it is needed when doing one hot encoding
        # this code is also used in predict.py and transform_utils.py
        # TODO it needs refactoring !!!
        too_much_categories = len(np.unique(list(X[column].values))) > 200
        lbl = None
        if (
            self._convert_method == PreprocessingCategorical.CONVERT_ONE_HOT
            and not too_much_categories
        ):
            lbl = LabelBinarizer()
            lbl.fit(X, column)
        else:
            lbl = LabelEncoder()
            lbl.fit(X[column])
        if lbl is not None:
            self._convert_params[column] = lbl.to_json()
def fit(
    self,
    X,
    y,
    sample_weight=None,
    X_validation=None,
    y_validation=None,
    sample_weight_validation=None,
    log_to_file=None,
    max_time=None,
):
    if self.is_fitted():
        print("CatBoost model already fitted. Skip fit().")
        return

    if self.cat_features is None:
        self.cat_features = []
        for i in range(X.shape[1]):
            if PreprocessingUtils.is_categorical(X.iloc[:, i]):
                self.cat_features += [i]

    eval_set = None
    if X_validation is not None and y_validation is not None:
        eval_set = Pool(
            data=X_validation,
            label=y_validation,
            cat_features=self.cat_features,
            weight=sample_weight_validation,
        )

    if self.params.get("num_boost_round") is None:
        model_init, new_iterations = self._assess_iterations(
            X, y, sample_weight, eval_set, max_time
        )
        self.model.set_params(iterations=new_iterations)
    else:
        model_init = None
        self.model.set_params(iterations=self.params.get("num_boost_round"))

    self.early_stopping_rounds = self.params.get("early_stopping_rounds", 50)

    self.model.fit(
        X,
        y,
        sample_weight=sample_weight,
        cat_features=self.cat_features,
        init_model=model_init,
        eval_set=eval_set,
        early_stopping_rounds=self.early_stopping_rounds,
        verbose_eval=False,
    )

    if self.model.best_iteration_ is not None:
        if model_init is not None:
            self.best_ntree_limit = (
                self.model.best_iteration_ + model_init.tree_count_ + 1
            )
        else:
            self.best_ntree_limit = self.model.best_iteration_ + 1
    else:
        # just take all the trees
        # the warm-up trees are already included
        # don't need to add +1
        self.best_ntree_limit = self.model.tree_count_

    if log_to_file is not None:
        train_scores = self.model.evals_result_["learn"].get(self.log_metric_name)
        validation_scores = self.model.evals_result_["validation"].get(
            self.log_metric_name
        )
        if model_init is not None:
            if train_scores is not None:
                train_scores = (
                    model_init.evals_result_["learn"].get(self.log_metric_name)
                    + train_scores
                )
            if validation_scores is not None:
                validation_scores = (
                    model_init.evals_result_["validation"].get(self.log_metric_name)
                    + validation_scores
                )
        iteration = None
        if train_scores is not None:
            iteration = range(len(train_scores))
        elif validation_scores is not None:
            iteration = range(len(validation_scores))

        result = pd.DataFrame(
            {
                "iteration": iteration,
                "train": train_scores,
                "validation": validation_scores,
            }
        )
        result.to_csv(log_to_file, index=False, header=False)
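# A quick check of the tree-count bookkeeping above, with hypothetical
# numbers: suppose a warm-start model contributed 50 trees and the
# continued fit stopped with best_iteration_ == 120. best_iteration_ is a
# 0-based index over the newly grown trees only, so the warm-up trees must
# be added back, plus 1 to turn an index into a count:
warmup_tree_count = 50
best_iteration = 120
best_ntree_limit = best_iteration + warmup_tree_count + 1
assert best_ntree_limit == 171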
def extensive_eda(X, y, save_path):
    # validate the input parameters
    if not isinstance(X, pd.DataFrame):
        raise ValueError("X should be a dataframe")
    if X.shape[0] != len(y):
        raise ValueError("X and y should have the same number of samples")

    if X.shape[1] > MAXCOL:
        X = X.iloc[:, :MAXCOL]
        warnings.warn(
            f"AutoML EDA column limit exceeded! Running for the first {MAXCOL} columns"
        )

    if save_path:
        if not os.path.exists(save_path):
            os.mkdir(save_path)
    else:
        raise ValueError("Please provide a valid path to save the Extensive EDA")

    plt.style.use("ggplot")

    try:
        if PreprocessingUtils.get_type(y) in ("categorical", "discrete"):
            for col in X.columns:
                if PreprocessingUtils.get_type(X[col]) == "continous":
                    plt.figure(figsize=(5, 5))
                    for i in np.unique(y):
                        sns.kdeplot(
                            x=X.iloc[np.where(y == i)[0]][col],
                            label=f"class {i}",
                            shade=True,
                        )
                    plt.legend()
                    plt.gca().set_title(
                        f"Distribution of {col} for each class",
                        fontsize=11,
                        weight="bold",
                        alpha=0.75,
                    )
                    plt.savefig(EDA.plot_path(save_path, col + "_target"))
                elif PreprocessingUtils.get_type(X[col]) in (
                    "categorical",
                    "discrete",
                ):
                    if X[col].nunique() > 7:
                        warnings.warn("Considering the 7 most frequent values")
                    values = X[col].value_counts().index[:7]
                    plt.figure(figsize=(5, 5))
                    sns.countplot(
                        x=X[X[col].isin(values)][col],
                        hue=y[X[col].isin(values)],
                    )
                    plt.gca().set_title(
                        f"Count plot of each {col}",
                        fontsize=11,
                        weight="bold",
                        alpha=0.75,
                    )
                    plt.savefig(EDA.plot_path(save_path, col + "_target"))
        elif PreprocessingUtils.get_type(y) == "continous":
            for col in X.columns:
                if PreprocessingUtils.get_type(X[col]) == "continous":
                    plt.figure(figsize=(5, 5))
                    plt.scatter(X[col].values, y)
                    plt.gca().set_xlabel(f"{col}")
                    plt.gca().set_ylabel("target")
                    plt.gca().set_title(
                        f"Scatter plot of {col} vs target",
                        fontsize=11,
                        weight="bold",
                        alpha=0.75,
                    )
                    plt.savefig(EDA.plot_path(save_path, col + "_target"))
                elif PreprocessingUtils.get_type(X[col]) in (
                    "categorical",
                    "discrete",
                ):
                    if X[col].nunique() > 7:
                        warnings.warn("Considering the 7 most frequent values")
                    plt.figure(figsize=(5, 5))
                    for i in X[col].value_counts().index[:7]:
                        sns.kdeplot(
                            x=y[X[X[col] == i].index],
                            shade=True,
                            label=f"{col}_{i}",
                        )
                    plt.gca().set_title(
                        f"Distribution of target for each {col}",
                        fontsize=11,
                        weight="bold",
                        alpha=0.75,
                    )
                    plt.legend()
                    plt.savefig(EDA.plot_path(save_path, col + "_target"))
                elif PreprocessingUtils.get_type(X[col]) == "datetime":
                    plt.figure(figsize=(5, 5))
                    plt.plot(X[col], y)
                    plt.gca().set_xticklabels(X[col].dt.date, rotation=45)
                    plt.gca().set_title(
                        "Distribution of target over time",
                        fontsize=11,
                        weight="bold",
                        alpha=0.75,
                    )
                    plt.savefig(EDA.plot_path(save_path, col + "_target"))

        cols = [
            col
            for col in X.columns
            if PreprocessingUtils.get_type(X[col]) == "continous"
        ][:COLS]
        if len(cols) > 0:
            plt.figure(figsize=(10, 10))
            sns.heatmap(X[cols].corr())
            plt.gca().set_title("Heatmap", fontsize=11, weight="bold", alpha=0.75)
            plt.savefig(os.path.join(save_path, "heatmap"))

        with open(os.path.join(save_path, "Extensive_EDA.md"), "w") as fout:
            for col in X.columns:
                fout.write(f"## Bivariate analysis of {col} feature with target\n")
                fout.write("\n![]({})\n".format(EDA.plot_fname(col + "_target")))
                fout.write("\n")
                fout.write("------------------------------------------------------\n")
            if len(cols) > 0:
                fout.write("## Heatmap\n")
                fout.write("![](heatmap.png)\n")
                fout.write("\n")
                fout.write("------------------------------------------------------\n")
    except Exception as e:
        raise AutoMLException(e)
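# A minimal usage sketch of extensive_eda() (toy data; "eda_report" is a
# hypothetical output directory):
X = pd.DataFrame({
    "age": np.random.normal(40, 10, 100),
    "city": np.random.choice(["NY", "LA", "SF"], 100),
})
y = pd.Series(np.random.choice([0, 1], 100))
extensive_eda(X, y, save_path="eda_report")
# Expected output: one bivariate plot per feature, a correlation heatmap
# over the continuous columns, and an Extensive_EDA.md report in save_path.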
def test_get_type_numpy_number(self):
    tmp = np.array([1, 2, 3])
    tmp_type = PreprocessingUtils.get_type(tmp)
    self.assertNotEqual(tmp_type, PreprocessingUtils.CATEGORICAL)
def get(required_preprocessing, data, machinelearning_task):
    X = data["train"]["X"]
    y = data["train"]["y"]

    columns_preprocessing = {}
    for col in X.columns:
        preprocessing_to_apply = []

        # remove empty columns and columns with only one unique value
        empty_column = np.sum(pd.isnull(X[col]) == True) == X.shape[0]
        constant_column = len(np.unique(X.loc[~pd.isnull(X[col]), col])) == 1
        if empty_column or constant_column:
            preprocessing_to_apply += ["remove_column"]
            columns_preprocessing[col] = preprocessing_to_apply
            continue

        # always check for missing values
        if (
            "missing_values_inputation" in required_preprocessing
            and PreprocessingUtils.is_na(X[col])
        ):
            preprocessing_to_apply += [PreprocessingMissingValues.FILL_NA_MEDIAN]

        # convert to categorical only for categorical types
        convert_to_integer_will_be_applied = False
        if (
            "convert_categorical" in required_preprocessing
            and PreprocessingUtils.is_categorical(X[col])
        ):
            preprocessing_to_apply += [PreprocessingCategorical.CONVERT_INTEGER]
            convert_to_integer_will_be_applied = True

        if "scale" in required_preprocessing:
            if convert_to_integer_will_be_applied:
                preprocessing_to_apply += [PreprocessingScale.SCALE_NORMAL]
            # elif PreprocessingUtils.is_log_scale_needed(X[col]):
            #     preprocessing_to_apply += [PreprocessingScale.SCALE_LOG_AND_NORMAL]
            elif PreprocessingUtils.is_scale_needed(X[col]):
                preprocessing_to_apply += [PreprocessingScale.SCALE_NORMAL]

        # remember which preprocessing we need to apply
        if preprocessing_to_apply:
            columns_preprocessing[col] = preprocessing_to_apply

    target_preprocessing = []
    # always remove missing values from the target,
    # missing values might be in train and in validation datasets
    target_preprocessing += [PreprocessingMissingValues.NA_EXCLUDE]

    if machinelearning_task == BINARY_CLASSIFICATION:
        if not PreprocessingUtils.is_0_1(y):
            target_preprocessing += [PreprocessingCategorical.CONVERT_INTEGER]
    if machinelearning_task == MULTICLASS_CLASSIFICATION:
        if PreprocessingUtils.is_categorical(y):
            target_preprocessing += [PreprocessingCategorical.CONVERT_INTEGER]
    if machinelearning_task == REGRESSION:
        if PreprocessingUtils.is_log_scale_needed(y):
            target_preprocessing += [PreprocessingScale.SCALE_LOG_AND_NORMAL]
        elif PreprocessingUtils.is_scale_needed(y):
            target_preprocessing += [PreprocessingScale.SCALE_NORMAL]

    return {
        "columns_preprocessing": columns_preprocessing,
        "target_preprocessing": target_preprocessing,
    }
def fit(
    self,
    X,
    y,
    sample_weight=None,
    X_validation=None,
    y_validation=None,
    sample_weight_validation=None,
    log_to_file=None,
    max_time=None,
):
    if self.model.tree_count_ is not None:
        print("CatBoost model already fitted. Skip fit().")
        return

    if self.cat_features is None:
        self.cat_features = []
        for i in range(X.shape[1]):
            if PreprocessingUtils.is_categorical(X.iloc[:, i]):
                self.cat_features += [i]

    eval_set = None
    if X_validation is not None and y_validation is not None:
        eval_set = Pool(
            data=X_validation,
            label=y_validation,
            cat_features=self.cat_features,
            weight=sample_weight_validation,
        )

    # disable for now ...
    model_init, new_iterations = self._assess_iterations(X, y, eval_set, max_time)
    self.model.set_params(iterations=new_iterations)

    self.model.fit(
        X,
        y,
        sample_weight=sample_weight,
        cat_features=self.cat_features,
        init_model=model_init,
        eval_set=eval_set,
        early_stopping_rounds=self.early_stopping_rounds,
        verbose_eval=False,
    )

    if self.model.best_iteration_ is not None:
        self.best_ntree_limit = (
            self.model.best_iteration_ + self.warmup_iterations + 1
        )
    else:
        # just take all the trees
        # the warm-up trees are already included
        # don't need to add +1
        self.best_ntree_limit = self.model.tree_count_

    if log_to_file is not None:
        metric_name = list(self.model.evals_result_["learn"].keys())[0]
        train_scores = self.model.evals_result_["learn"][metric_name]
        validation_scores = self.model.evals_result_["validation"][metric_name]
        if model_init is not None:
            train_scores = (
                model_init.evals_result_["learn"][metric_name] + train_scores
            )
            validation_scores = (
                model_init.evals_result_["validation"][metric_name]
                + validation_scores
            )
        result = pd.DataFrame(
            {
                "iteration": range(len(train_scores)),
                "train": train_scores,
                "validation": validation_scores,
            }
        )
        result.to_csv(log_to_file, index=False, header=False)
def get(required_preprocessing, data, machinelearning_task):
    X = data["train"]["X"]
    y = data["train"]["y"]

    columns_preprocessing = {}
    for col in X.columns:
        preprocessing_to_apply = []

        # remove empty columns and columns with only one unique value
        empty_column = np.sum(pd.isnull(X[col]) == True) == X.shape[0]
        constant_column = len(np.unique(X.loc[~pd.isnull(X[col]), col])) == 1
        if empty_column or constant_column:
            preprocessing_to_apply += ["remove_column"]
            columns_preprocessing[col] = preprocessing_to_apply
            continue

        # always check for missing values
        if (
            "missing_values_inputation" in required_preprocessing
            and PreprocessingUtils.is_na(X[col])
        ):
            preprocessing_to_apply += [PreprocessingMissingValues.FILL_NA_MEDIAN]

        # convert to categorical only for categorical types
        convert_to_integer_will_be_applied = False
        if (
            "convert_categorical" in required_preprocessing
            and PreprocessingUtils.is_categorical(X[col])
        ):
            preprocessing_to_apply += [PreprocessingCategorical.CONVERT_INTEGER]
            convert_to_integer_will_be_applied = True

        if "scale" in required_preprocessing:
            if convert_to_integer_will_be_applied:
                preprocessing_to_apply += [Scale.SCALE_NORMAL]
            # elif PreprocessingUtils.is_log_scale_needed(X[col]):
            #     preprocessing_to_apply += [Scale.SCALE_LOG_AND_NORMAL]
            elif PreprocessingUtils.is_scale_needed(X[col]):
                preprocessing_to_apply += [Scale.SCALE_NORMAL]

        # remember which preprocessing we need to apply
        if preprocessing_to_apply:
            columns_preprocessing[col] = preprocessing_to_apply

    target_preprocessing = []
    # always remove missing values from the target,
    # a target with missing values might be in the train and in the validation datasets
    target_preprocessing += [PreprocessingMissingValues.NA_EXCLUDE]

    if "target_as_integer" in required_preprocessing:
        if machinelearning_task == BINARY_CLASSIFICATION:
            if not PreprocessingUtils.is_0_1(y):
                target_preprocessing += [PreprocessingCategorical.CONVERT_INTEGER]
        if machinelearning_task == MULTICLASS_CLASSIFICATION:
            # always convert to integer, there are many situations that can
            # break otherwise, for example classes starting from 1, or classes
            # that skip numbers (0, 2, 3, 4) - so just always convert
            target_preprocessing += [PreprocessingCategorical.CONVERT_INTEGER]
    elif "target_as_one_hot" in required_preprocessing:
        target_preprocessing += [PreprocessingCategorical.CONVERT_ONE_HOT]

    if (
        machinelearning_task == REGRESSION
        and "target_scale" in required_preprocessing
    ):
        if PreprocessingUtils.is_log_scale_needed(y):
            target_preprocessing += [Scale.SCALE_LOG_AND_NORMAL]
        elif PreprocessingUtils.is_scale_needed(y):
            target_preprocessing += [Scale.SCALE_NORMAL]

    """
    if machinelearning_task == BINARY_CLASSIFICATION:
        if not PreprocessingUtils.is_0_1(y):
            target_preprocessing += [PreprocessingCategorical.CONVERT_INTEGER]
    if machinelearning_task == MULTICLASS_CLASSIFICATION:
        if PreprocessingUtils.is_categorical(y):
            target_preprocessing += [PreprocessingCategorical.CONVERT_INTEGER]
    """

    return {
        "columns_preprocessing": columns_preprocessing,
        "target_preprocessing": target_preprocessing,
        "ml_task": machinelearning_task,
    }
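# A minimal sketch of the preprocessing plan get() returns (toy data; the
# exact constants in the result depend on the PreprocessingUtils heuristics):
data = {
    "train": {
        "X": pd.DataFrame({"age": [22.0, 35.0, np.nan, 41.0],
                           "city": ["NY", "LA", "NY", "SF"]}),
        "y": pd.Series(["yes", "no", "yes", "no"]),
    }
}
plan = get(
    ["missing_values_inputation", "convert_categorical", "target_as_integer"],
    data,
    BINARY_CLASSIFICATION,
)
# plan["columns_preprocessing"] -> e.g.
#   {"age": [PreprocessingMissingValues.FILL_NA_MEDIAN],
#    "city": [PreprocessingCategorical.CONVERT_INTEGER]}
# plan["target_preprocessing"]  -> [NA_EXCLUDE, CONVERT_INTEGER]
# plan["ml_task"]               -> BINARY_CLASSIFICATION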
def optimize(
    self,
    algorithm,
    data_type,
    X_train,
    y_train,
    sample_weight,
    X_validation,
    y_validation,
    sample_weight_validation,
    learner_params,
):
    # only tune models with the original data type
    if data_type != "original":
        return learner_params

    key = f"{data_type}_{algorithm}"
    if key in self.tuning:
        return self.update_learner_params(learner_params, self.tuning[key])

    if self.verbose:
        print(
            f"Optuna optimizes {algorithm} with time budget {self.time_budget} seconds "
            f"eval_metric {self.eval_metric.name} ({self.direction})"
        )

    self.cat_features_indices = []
    for i in range(X_train.shape[1]):
        if PreprocessingUtils.is_categorical(X_train.iloc[:, i]):
            self.cat_features_indices += [i]

    study = optuna.create_study(
        direction=self.direction,
        sampler=optuna.samplers.TPESampler(seed=self.random_state),
        pruner=optuna.pruners.MedianPruner(n_warmup_steps=self.n_warmup_steps),
    )
    objective = None
    if algorithm == "LightGBM":
        objective = LightgbmObjective(
            self.ml_task,
            X_train,
            y_train,
            sample_weight,
            X_validation,
            y_validation,
            sample_weight_validation,
            self.eval_metric,
            self.cat_features_indices,
            self.n_jobs,
            self.random_state,
        )
    elif algorithm == "Xgboost":
        objective = XgboostObjective(
            self.ml_task,
            X_train,
            y_train,
            sample_weight,
            X_validation,
            y_validation,
            sample_weight_validation,
            self.eval_metric,
            self.n_jobs,
            self.random_state,
        )
    elif algorithm == "CatBoost":
        objective = CatBoostObjective(
            self.ml_task,
            X_train,
            y_train,
            sample_weight,
            X_validation,
            y_validation,
            sample_weight_validation,
            self.eval_metric,
            self.cat_features_indices,
            self.n_jobs,
            self.random_state,
        )
    elif algorithm == "Random Forest":
        objective = RandomForestObjective(
            self.ml_task,
            X_train,
            y_train,
            sample_weight,
            X_validation,
            y_validation,
            sample_weight_validation,
            self.eval_metric,
            self.n_jobs,
            self.random_state,
        )
    elif algorithm == "Extra Trees":
        objective = ExtraTreesObjective(
            self.ml_task,
            X_train,
            y_train,
            sample_weight,
            X_validation,
            y_validation,
            sample_weight_validation,
            self.eval_metric,
            self.n_jobs,
            self.random_state,
        )
    elif algorithm == "Nearest Neighbors":
        objective = KNNObjective(
            self.ml_task,
            X_train,
            y_train,
            sample_weight,
            X_validation,
            y_validation,
            sample_weight_validation,
            self.eval_metric,
            self.n_jobs,
            self.random_state,
        )
    elif algorithm == "Neural Network":
        objective = NeuralNetworkObjective(
            self.ml_task,
            X_train,
            y_train,
            sample_weight,
            X_validation,
            y_validation,
            sample_weight_validation,
            self.eval_metric,
            self.n_jobs,
            self.random_state,
        )

    study.optimize(objective, n_trials=5000, timeout=self.time_budget)

    self.plot_study(algorithm, data_type, study)

    joblib.dump(study, os.path.join(self.study_dir, key + ".joblib"))

    best = study.best_params

    if algorithm == "LightGBM":
        best["metric"] = objective.eval_metric_name
        best["custom_eval_metric_name"] = objective.custom_eval_metric_name
        best["num_boost_round"] = objective.rounds
        best["early_stopping_rounds"] = objective.early_stopping_rounds
        # best["learning_rate"] = objective.learning_rate
        best["cat_feature"] = self.cat_features_indices
        best["feature_pre_filter"] = False
        best["seed"] = objective.seed
    elif algorithm == "CatBoost":
        best["eval_metric"] = objective.eval_metric_name
        best["num_boost_round"] = objective.rounds
        best["early_stopping_rounds"] = objective.early_stopping_rounds
        # best["bootstrap_type"] = "Bernoulli"
        # best["learning_rate"] = objective.learning_rate
        best["seed"] = objective.seed
    elif algorithm == "Xgboost":
        best["objective"] = objective.objective
        best["eval_metric"] = objective.eval_metric_name
        # best["eta"] = objective.learning_rate
        best["max_rounds"] = objective.rounds
        best["early_stopping_rounds"] = objective.early_stopping_rounds
        best["seed"] = objective.seed
    elif algorithm == "Extra Trees":
        # Extra Trees is not using early stopping
        best["max_steps"] = objective.max_steps  # each step has 100 trees
        best["seed"] = objective.seed
        best["eval_metric_name"] = self.eval_metric.name
    elif algorithm == "Random Forest":
        # Random Forest is not using early stopping
        best["max_steps"] = objective.max_steps  # each step has 100 trees
        best["seed"] = objective.seed
        best["eval_metric_name"] = self.eval_metric.name
    elif algorithm == "Nearest Neighbors":
        best["rows_limit"] = 100000
    elif algorithm == "Neural Network":
        pass

    self.tuning[key] = best
    self.save()

    return self.update_learner_params(learner_params, best)
def compute(X_train, y_train, eda_path):
    try:
        # if the directory exists, the EDA analysis was probably already done
        if os.path.exists(eda_path):
            return
        else:
            # need to create a directory for the EDA analysis
            os.mkdir(eda_path)

        inform = defaultdict(list)

        if isinstance(y_train, pd.Series):
            if PreprocessingUtils.get_type(y_train) == "categorical":
                plt.figure(figsize=(5, 5))
                sns.countplot(y_train, color=BLUE)
                plt.title("Target class distribution")
                plt.tight_layout(pad=2.0)
                plot_path = os.path.join(eda_path, "target.png")
                plt.savefig(plot_path)
                plt.close("all")
            else:
                plt.figure(figsize=(5, 5))
                sns.distplot(y_train, color=BLUE)
                plt.title("Target class distribution")
                plt.tight_layout(pad=2.0)
                plot_path = os.path.join(eda_path, "target.png")
                plt.savefig(plot_path)
                plt.close("all")

            inform["missing"].append(
                pd.isnull(y_train).sum() * 100 / y_train.shape[0]
            )
            inform["unique"].append(y_train.nunique())
            inform["feature_type"].append(PreprocessingUtils.get_type(y_train))
            inform["plot"].append("![](target.png)")
            inform["feature"].append("target")
            inform["desc"].append(y_train.describe().to_dict())

        for col in X_train.columns:
            inform["feature_type"].append(PreprocessingUtils.get_type(X_train[col]))

            if PreprocessingUtils.get_type(X_train[col]) in (
                "categorical",
                "discrete",
            ):
                plt.figure(figsize=(5, 5))
                chart = sns.countplot(
                    X_train[col],
                    order=X_train[col].value_counts().iloc[:10].index,
                    color=BLUE,
                )
                chart.set_xticklabels(chart.get_xticklabels(), rotation=90)
                plt.title(f"{col} class distribution")
                plt.tight_layout(pad=2.0)
                plot_path = os.path.join(eda_path, f"{col}.png")
                plt.savefig(plot_path)
                plt.close("all")
            elif PreprocessingUtils.get_type(X_train[col]) == "continous":
                plt.figure(figsize=(5, 5))
                sns.distplot(X_train[col], color=BLUE)
                plt.title(f"{col} value distribution")
                plt.tight_layout(pad=2.0)
                plot_path = os.path.join(eda_path, f"{col}.png")
                plt.savefig(plot_path)
                plt.close("all")
            elif PreprocessingUtils.get_type(X_train[col]) == "text":
                plt.figure(figsize=(10, 10), dpi=70)
                word_string = " ".join(X_train[col].str.lower())
                wordcloud = WordCloud(
                    width=500,
                    height=500,
                    stopwords=STOPWORDS,
                    background_color="white",
                    max_words=400,
                    max_font_size=None,
                ).generate(word_string)
                plt.imshow(wordcloud, aspect="auto", interpolation="nearest")
                plt.axis("off")
                plot_path = os.path.join(eda_path, f"{col}.png")
                plt.savefig(plot_path)
            elif PreprocessingUtils.get_type(X_train[col]) == "datetime":
                plt.figure(figsize=(5, 5))
                pd.to_datetime(X_train[col]).plot(grid=True, color=BLUE)
                plt.tight_layout(pad=2.0)
                plot_path = os.path.join(eda_path, f"{col}.png")
                plt.savefig(plot_path)
                plt.close("all")

            inform["missing"].append(
                pd.isnull(X_train[col]).sum() * 100 / X_train.shape[0]
            )
            inform["unique"].append(int(X_train[col].nunique()))
            inform["plot"].append(f"![]({col}.png)")
            inform["feature"].append(str(col))
            inform["desc"].append(X_train[col].describe().to_dict())

        df = pd.DataFrame(inform)

        with open(os.path.join(eda_path, "README.md"), "w") as fout:
            for i, row in df.iterrows():
                fout.write(f"## Feature : {row['feature']}\n")
                fout.write(f"- **Feature type** : {row['feature_type']}\n")
                fout.write(f"- **Missing** : {row['missing']}%\n")
                fout.write(f"- **Unique** : {row['unique']}\n")
                for key in row["desc"].keys():
                    if key in ("25%", "50%", "75%"):
                        fout.write(
                            f"- **{key.capitalize()}th Percentile** : {row['desc'][key]}\n"
                        )
                    else:
                        fout.write(
                            f"- **{key.capitalize()}** : {row['desc'][key]}\n"
                        )
                fout.write(f"- {row['plot']}\n")
    except Exception as e:
        logger.error(f"There was an issue when running EDA. {str(e)}")
def test_get_type_numpy_categorical(self):
    tmp = np.array(["a", "b", "c"])
    tmp_type = PreprocessingUtils.get_type(tmp)
    self.assertEqual(tmp_type, PreprocessingUtils.CATEGORICAL)