def preprocess_alt(df, to_clf, red_corr=True, tr=0.90, n=3, normalization=None,
                   ignore_classes=None, mean_by=None, remove_outliers=None,
                   train_ratio=0.7, as_df=False, random_state=42):
    """Run an optional preprocessing pipeline on *df*.

    Steps (each applied only when its option is set): group-mean aggregation
    (``mean_df``), normalization (``normalize``), correlated-feature reduction
    (``red_correlations``) and row-wise outlier removal
    (``eval.outliers_category``).  The input frame is never modified — a copy
    is taken first.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data.
    to_clf : str
        Target/class column name, forwarded to the helper steps.
    red_corr : bool
        If True, drop correlated features using threshold ``tr`` and ``n``.
    tr, n
        Forwarded to ``red_correlations``.
    normalization : str or None
        Normalization method name; skipped when ``None``.
    ignore_classes : list or None
        Columns excluded from normalization.  Defaults to an empty list
        (``None`` sentinel avoids the shared mutable-default pitfall of the
        original ``ignore_classes=[]``).
    mean_by : str or None
        If given, aggregate rows via ``mean_df`` first.
    remove_outliers : int or None
        If given, drop rows whose outlier count exceeds this value.
    train_ratio, random_state
        Forwarded to ``split_dataset`` when splitting.
    as_df : bool
        If True, return the processed DataFrame; otherwise return whatever
        ``split_dataset`` returns (a train/test split).

    Returns
    -------
    pandas.DataFrame or the tuple produced by ``split_dataset``.
    """
    if ignore_classes is None:  # materialize the default per call, not per def
        ignore_classes = []
    data = df.copy()
    if mean_by is not None:
        data = mean_df(data, mean_by)
    if normalization is not None:
        data = normalize(data, ignore_classes, to_clf, norm=normalization)
    if red_corr:
        data = red_correlations(data, to_clf, tr=tr, n=n)
    if remove_outliers is not None:
        # `eval` here is a project module (shadows the builtin) — TODO confirm
        # against the file's imports.
        outliers_row = eval.outliers_category(data, data.columns, ratio=1.5, by="row")
        # `idx`/`cnt` replace the original `n`/`i`, which shadowed the `n`
        # parameter.  NOTE(review): drop() receives *positions* from enumerate
        # as if they were index labels — assumes a default RangeIndex at this
        # point; confirm upstream steps preserve it.
        remove = [idx for idx, cnt in enumerate(outliers_row) if cnt > remove_outliers]
        data = data.drop(remove)
        print(data.head())
    if as_df:
        return data
    return split_dataset(data, to_clf, train_ratio, random_state)
# NOTE(review): whitespace-mangled top-level script code (Jupyter-style "#%%"
# cell markers).  The `"""` right after plt.show() appears to OPEN a
# triple-quoted string whose closing delimiter is outside this chunk — if so,
# everything after it (the datapp.subsample_split experiment, the shape prints,
# and both outlier-frequency histograms) is dead code living inside that
# string, consistent with the Portuguese note "Redução piora um bocado"
# ("reduction makes it quite a bit worse").  TODO: confirm the string boundary
# before restructuring.
# NOTE(review): `plt.title = "..."` assigns over the pyplot `title` function
# instead of calling plt.title("...") — those chart titles would never render
# and later plt.title(...) calls would break.  Fix once this code is confirmed
# live.  Also `bin = 20` shadows the builtin `bin`.
plt.show() """ #Redução piora um bocado #%% #df = datapp.subsample(data, to_clf) trnX, tstX, valX, trnY, tstY, valY = datapp.subsample_split( data, to_clf, categoric, "standard") print(trnX.shape) print(tstX.shape) print(valX.shape) #%% outliers_col = eval.outliers_category(data, data.columns.tolist(), ratio=1.5, by="column") outliers_row = eval.outliers_category(data, data.columns.tolist(), ratio=1.5, by="row") plt.figure() bin = 20 plt.hist(outliers_col, bin) plt.title = "Frequencies of quantity of outliers (by column)" plt.show() plt.figure() bin = 20 plt.hist(outliers_row, bin, cumulative=False) plt.title = "Frequencies of quantity of outliers (by row)" plt.show()