def jointplots_df(self, df0, idx_target=0):
    """Save per-feature histograms and pairwise joint plots coloured by the target.

    For every non-target column a histogram ``plot_<i>_<i>_<column>.png`` is
    written, and for every pair of non-target columns a joint plot
    ``pairplot_<i>_<j>_<c1>_<c2>.png`` is written. All files go to
    ``self.folder_out``.

    Args:
        df0: source DataFrame. For each plot only the columns involved are
            selected and rows with missing values in them are dropped.
        idx_target: positional index of the column used as ``hue``.

    Returns:
        None.
    """
    columns = df0.columns.to_numpy()
    target = columns[idx_target]
    idx = numpy.delete(numpy.arange(0, len(columns)), idx_target)
    pal = 'tab10'

    # --- one histogram per feature ---
    for i, col_idx in enumerate(idx):
        c = columns[col_idx]
        df = df0[[target, c]]
        df = df.dropna()
        df = tools_DF.hash_categoricals(df)

        fig = plt.figure()
        fig = self.turn_light_mode(fig)
        plt.grid(color=self.clr_grid)
        ax = seaborn.histplot(data=df, x=c, hue=target, palette=pal, element='poly', legend=True)
        # histplot returns the Axes it drew on, so the legend is read through
        # the public accessor instead of a private matplotlib attribute.
        legend = ax.get_legend()
        self.recolor_legend_seaborn(legend)
        plt.savefig(self.folder_out + 'plot_%02d_%02d_%s.png' % (i, i, c), facecolor=fig.get_facecolor())
        plt.close()

    # --- one joint plot per pair of features ---
    for i in range(len(idx) - 1):
        for j in range(i + 1, len(idx)):
            c1, c2 = columns[idx[i]], columns[idx[j]]
            df = df0[[target, c1, c2]]
            df = df.dropna()
            df = tools_DF.hash_categoricals(df)

            # This helper figure only supplies the face colour used on save.
            fig = plt.figure()
            fig = self.turn_light_mode(fig)

            J = seaborn.jointplot(data=df, x=c1, y=c2, hue=target, palette=pal, edgecolor=None)
            J.ax_joint.grid(color=self.clr_grid)
            J.ax_joint.set_facecolor(self.clr_bg)
            J.ax_marg_x.set_facecolor(self.clr_bg)
            J.ax_marg_y.set_facecolor(self.clr_bg)
            J.ax_joint.xaxis.label.set_color(self.clr_font)
            J.ax_joint.yaxis.label.set_color(self.clr_font)
            legend = J.ax_joint.legend()
            self.recolor_legend_plt(legend)

            plt.savefig(self.folder_out + 'pairplot_%02d_%02d_%s_%s.png' % (i, j, c1, c2), facecolor=fig.get_facecolor())
            # jointplot draws on a figure of its own; close it together with
            # the helper figure so figures do not accumulate across pairs.
            plt.close(J.ax_joint.figure)
            plt.close(fig)

    return
def get_roc(self, df0, idx_target, plots_train, plots_test):
    """Train four classifiers and publish their train/test ROC images.

    Rows with missing values are dropped and the target column is binarised
    (1 where the value is <= 0, otherwise 0) before categoricals are hashed.
    After each classifier run, ``ROC_train.png`` and ``ROC_test.png`` in
    ``self.folder_out`` are renamed to random file names and wrapped in
    ``html.Img`` elements.

    Args:
        df0: source DataFrame.
        idx_target: positional index of the target column.
        plots_train: indexable container; slot ``i`` receives the train image
            of the i-th classifier.
        plots_test: indexable container; slot ``i`` receives the test image
            of the i-th classifier.

    Returns:
        The tuple ``(plots_train, plots_test)``.
    """
    target_name = df0.columns.to_numpy()[idx_target]

    data = df0.dropna()
    data[target_name] = (data[target_name] <= 0).astype(int)
    data = tools_DF.hash_categoricals(data)

    def publish(produced_name):
        # Give the freshly written image a random name and wrap it for the layout.
        asset_name = next(tempfile._get_candidate_names()) + '.png'
        os.rename(self.folder_out + produced_name, self.folder_out + asset_name)
        return [html.Img(src=self.app.get_asset_url(asset_name))]

    classifiers = [
        classifier_LM.classifier_LM(),
        classifier_SVM.classifier_SVM(),
        classifier_RF.classifier_RF(),
        classifier_KNN.classifier_KNN(),
    ]

    for slot, clf in enumerate(classifiers):
        runner = tools_ML_v2.ML(clf, self.folder_out, self.P.dark_mode)
        runner.E2E_train_test_df(data, idx_target, do_pca=False)
        plots_train[slot] = publish('ROC_train.png')
        plots_test[slot] = publish('ROC_test.png')

    return plots_train, plots_test
def ex_14_hash_categoricasl(df):
    """Print the head of *df* before and after ``tools_DF.hash_categoricals``.

    Rows containing missing values are dropped before the first print.
    """
    cleaned = df.dropna()
    print(cleaned.head())
    print()

    hashed = tools_DF.hash_categoricals(cleaned)
    print(hashed.head())
    return
def get_pairplots(self, df0, idx_target, pairplots):
    """Fill *pairplots* with up to four 2D plots of the best-scoring features.

    Features are ordered by descending ``F_score`` and the top four are kept.
    Their pairs are visited in (i, j) order with i < j, and only the first
    four pairs are plotted.

    Args:
        df0: source DataFrame.
        idx_target: positional index of the target column.
        pairplots: indexable container; slot ``k`` receives the k-th image.

    Returns:
        The same *pairplots* container.
    """
    FI = tools_feature_importance.evaluate_feature_importance(df0, idx_target)
    order = numpy.argsort(-FI['F_score'].to_numpy())
    top_features = FI['features'].to_numpy()[order][:4]
    target = df0.columns[idx_target]

    n_top = len(top_features)
    feature_pairs = [
        (top_features[a], top_features[b])
        for a in range(n_top)
        for b in range(a + 1, n_top)
    ]

    for slot, (c1, c2) in enumerate(feature_pairs[:4]):
        df = df0[[target, c1, c2]].dropna()
        # Binarise the target: 1 where the value is <= 0, otherwise 0.
        df[target] = (df[target] <= 0).astype(int)
        df = tools_DF.hash_categoricals(df)

        URL = next(tempfile._get_candidate_names()) + '.png'
        self.P.plot_2D_features_v3(df, remove_legend=True, add_noice=True, transparency=0.75, filename_out=URL)
        pairplots[slot] = [html.Img(src=self.app.get_asset_url(URL))]

    return pairplots
def get_density(self, df0, idx_target, plots_dnst):
    """Publish a 2D density plot per classifier over the two best features.

    Features are ordered by descending ``F_score``; the two best are kept
    together with the target, which therefore sits at column 0 of the working
    frame. The target is binarised (1 where the value is <= 0, otherwise 0).
    Indexing the two best features requires at least two ranked features.

    Args:
        df0: source DataFrame.
        idx_target: positional index of the target column in *df0*.
        plots_dnst: indexable container; slot ``i`` receives the image of the
            i-th classifier.

    Returns:
        The same *plots_dnst* container.
    """
    FI = tools_feature_importance.evaluate_feature_importance(df0, idx_target)
    order = numpy.argsort(-FI['F_score'].to_numpy())
    top_features = FI['features'].to_numpy()[order][:4]
    target = df0.columns[idx_target]

    data = df0[[target, top_features[0], top_features[1]]].dropna()
    data[target] = (data[target] <= 0).astype(int)
    data = tools_DF.hash_categoricals(data)

    classifiers = [
        classifier_LM.classifier_LM(),
        classifier_SVM.classifier_SVM(),
        classifier_RF.classifier_RF(),
        classifier_KNN.classifier_KNN(),
    ]

    for slot, clf in enumerate(classifiers):
        runner = tools_ML_v2.ML(clf, self.folder_out, self.P.dark_mode)
        runner.E2E_train_test_df(data, 0, do_pca=False)
        runner.plot_density_2d(data, idx_target=0, N=30, filename_out='density.png')

        # Move the fixed output name to a random one before the next run.
        URL = next(tempfile._get_candidate_names()) + '.png'
        os.rename(self.folder_out + 'density.png', self.folder_out + URL)
        plots_dnst[slot] = [html.Img(src=self.app.get_asset_url(URL))]

    return plots_dnst
def plot_TS_separatly(self, df, idx_target):
    """Write one time-series plot per column, named ``<column>.png``.

    Args:
        df: source DataFrame; categoricals are hashed before plotting.
        idx_target: not used by this method.
    """
    hashed = tools_DF.hash_categoricals(df)
    for position, name in enumerate(hashed.columns):
        self.TS_matplotlib(hashed, idxs_target=[position], idx_feature=None, filename_out='%s.png' % name)
    return
def get_data_titanic():
    """Load the seaborn titanic dataset as a feature matrix and label vector.

    Rows with missing values are dropped and categoricals are hashed.

    Returns:
        ``(X, Y)`` where ``X`` holds the ``sex`` and ``deck`` columns and
        ``Y`` is the flattened ``survived`` column.
    """
    target, c1, c2 = 'survived', 'sex', 'deck'

    frame = seaborn.load_dataset('titanic').dropna()
    frame = tools_DF.hash_categoricals(frame)

    X = frame[[c1, c2]].to_numpy()
    Y = frame[[target]].to_numpy().flatten()
    return X, Y
def preprocess(df, idx_target):
    """Split *df* into a feature matrix and a target vector.

    Rows with missing values are dropped and categoricals are hashed first.

    Args:
        df: source DataFrame.
        idx_target: positional index of the target column.

    Returns:
        ``(X, Y)``: all non-target columns as a 2D array, and the target
        column as a 1D array.
    """
    prepared = tools_DF.hash_categoricals(df.dropna())

    n_columns = len(prepared.columns.to_numpy())
    feature_positions = numpy.delete(numpy.arange(0, n_columns), idx_target)

    X = prepared.iloc[:, feature_positions].to_numpy()
    Y = prepared.iloc[:, idx_target].to_numpy()
    return X, Y
def ex_01_ugly():
    """Show a filled KDE joint plot of ``sex`` vs ``age`` coloured by ``survived``.

    Uses the seaborn titanic dataset; rows with missing values in the three
    columns are dropped and categoricals are hashed before plotting.
    """
    target, c1, c2 = 'survived', 'sex', 'age'

    titanic = seaborn.load_dataset('titanic')
    subset = titanic[[target, c1, c2]].dropna()
    subset = tools_DF.hash_categoricals(subset)

    seaborn.jointplot(data=subset, x=c1, y=c2, hue=target, kind="kde", fill=True)
    plt.show()
    return
def ex_view_tree(df, idx_target):
    """Fit a depth-3 decision tree on *df* with debugging enabled.

    Rows with missing values are dropped and categoricals are hashed. The
    non-target column names are passed to the classifier alongside the data.

    Args:
        df: source DataFrame.
        idx_target: positional index of the target column.
    """
    prepared = tools_DF.hash_categoricals(df.dropna())
    X, Y = tools_DF.df_to_XY(prepared, idx_target, keep_categoirical=False)

    all_names = prepared.columns.to_numpy()
    feature_positions = numpy.delete(numpy.arange(0, len(all_names)), idx_target)
    feature_names = all_names[feature_positions]

    tree = classifier_DTree.classifier_DT(max_depth=3, folder_out=folder_out)
    tree.learn(X, Y, feature_names, do_debug=True)
    return
def plot_all_in_one(df0, idx_target):
    """Plot the four best-scoring features together and the target separately.

    The frame is hashed and scaled, and features are ordered by descending
    F-score. The target plus the top four features form the plotted frame,
    with the target at column 0. Output files are ``all_best_features.png``
    and ``target.png``.

    Args:
        df0: source DataFrame.
        idx_target: positional index of the target column.
    """
    prepared = tools_DF.hash_categoricals(df0)
    prepared = tools_DF.scale(prepared)

    target = prepared.columns[idx_target]
    feature_positions = numpy.delete(numpy.arange(0, prepared.shape[1]), idx_target)
    features = prepared.columns.to_numpy()[feature_positions]

    scores = tools_feature_importance.feature_imporance_F_score(prepared, idx_target)
    ranking = numpy.argsort(-scores)
    best_features = features[ranking][:4]

    selected = prepared[[target] + best_features.tolist()]
    selected = tools_DF.hash_categoricals(selected)

    feature_slots = numpy.arange(1, selected.shape[1]).tolist()
    P.TS_seaborn(selected, feature_slots, None, filename_out='all_best_features.png')
    P.TS_seaborn(selected, 0, None, filename_out='target.png')
    return
def ex_VIF(df):
    """Print the variance inflation factor of every column, smallest first.

    Categoricals are hashed and rows with missing values are dropped before
    the factors are computed. Each output line is ``<VIF>\\t<column>``.
    """
    cleaned = tools_DF.hash_categoricals(df).dropna()
    names = cleaned.columns.to_numpy()
    matrix = cleaned.values

    VIFs = numpy.array([variance_inflation_factor(matrix, k) for k in range(matrix.shape[1])])

    for k in numpy.argsort(VIFs):
        print('%1.2f\t%s' % (VIFs[k], names[k]))
    return
def pairplots_df(self, df0, idx_target=0, cumul_mode=False, add_noise=True):
    """Save pairwise 2D plots and per-feature 1D plots, plus a score index file.

    For each pair of non-target columns a ``pairplot_<i>_<j>_<c1>_<c2>_<I>.png``
    image is written, where ``I`` is 100 times the summed mutual information
    between the two features and the target, truncated to an integer. Every
    pair also appends a ``<file> <I>`` line to ``descript.ion`` in
    ``self.folder_out``. Afterwards one ``plot_<i>_<i>_<column>.png`` is
    written per feature.

    Args:
        df0: source DataFrame. For each plot only the columns involved are
            selected and rows with missing values in them are dropped.
        idx_target: positional index of the target column.
        cumul_mode: when true, pairs are drawn with
            ``plot_2D_features_cumul`` instead of ``plot_2D_features_v3``.
        add_noise: forwarded to ``plot_2D_features_v3``; also selects the
            transparency (0.95 with noise, 0 without).

    Returns:
        None.
    """
    descript_path = self.folder_out + "descript.ion"

    # Start from an empty index file.
    with open(descript_path, "w"):
        pass

    columns = df0.columns.to_numpy()
    target = columns[idx_target]
    idx = numpy.delete(numpy.arange(0, len(columns)), idx_target)
    transparency = 0.95 if add_noise else 0

    # --- 2D plots for every pair of features ---
    for i in range(len(idx) - 1):
        for j in range(i + 1, len(idx)):
            c1, c2 = columns[idx[i]], columns[idx[j]]
            df = df0[[target, c1, c2]]
            df = df.dropna()
            df = tools_DF.hash_categoricals(df)

            # Column 0 is the target; columns 1 and 2 are the two features.
            I = int(100 * mutual_info_classif(df.iloc[:, [1, 2]], df.iloc[:, 0]).sum())
            file_out = 'pairplot_%02d_%02d_%s_%s_%02d.png' % (i, j, c1, c2, I)

            if cumul_mode:
                self.plot_2D_features_cumul(df, remove_legend=True, filename_out=file_out)
            else:
                self.plot_2D_features_v3(df, add_noice=add_noise, transparency=transparency, remove_legend=True, filename_out=file_out)

            # Append per pair so the index reflects every image written so far.
            with open(descript_path, "a") as f_handle:
                f_handle.write("%s %s\n" % (file_out, '%03d' % I))

    # --- 1D plots for every feature ---
    for i, col_idx in enumerate(idx):
        c1 = columns[col_idx]
        df = df0[[target, c1]]
        df = df.dropna()
        df = tools_DF.hash_categoricals(df)

        # Take the maximum as a scalar: numpy.arange needs a scalar stop
        # value, not a one-element Series.
        bins = numpy.arange(-0.5, df[c1].max() + 0.5, 0.25)
        self.plot_1D_features_pos_neg(df[[c1]].to_numpy(), df[target].to_numpy(), labels=True, bins=bins, filename_out='plot_%02d_%02d_%s.png' % (i, i, c1))

    return
def ex3():
    """Print the mutual information for every pair of titanic features.

    Reads the tab-separated ``dataset_titanic.csv`` from ``folder_in``, drops
    rows with missing values and hashes categoricals. Column 0 is the target.
    Each output line holds the two column names followed by the value
    returned by ``tools_DF.get_Mutual_Information``.
    """
    idx_target = 0
    df = pd.read_csv(folder_in + 'dataset_titanic.csv', sep='\t')
    df = tools_DF.hash_categoricals(df.dropna())

    columns = df.columns
    feature_positions = numpy.delete(numpy.arange(0, len(columns)), idx_target)

    for offset, pos1 in enumerate(feature_positions[:-1]):
        for pos2 in feature_positions[offset + 1:]:
            I = tools_DF.get_Mutual_Information(df, idx_target, pos1, pos2)
            print(columns[pos1], columns[pos2], I)
    return
def ex_VIF2(df):
    """Print a regression-based variance inflation factor per column, smallest first.

    Each column is regressed with ``OLS`` on all the other columns and its
    factor is ``1 / (1 - R^2)`` rounded to two decimals. Categoricals are
    hashed and rows with missing values are dropped beforehand. Each output
    line is ``<VIF>\\t<column>``.
    """
    cleaned = tools_DF.hash_categoricals(df).dropna()
    names = cleaned.columns

    VIFs = []
    for name in names:
        response = cleaned[name]
        predictors = cleaned[names.drop([name])]
        r2 = OLS(response, predictors).fit().rsquared
        VIFs.append(round(1 / (1 - r2), 2))

    for k in numpy.argsort(VIFs):
        print('%1.2f\t%s' % (VIFs[k], names[k]))
    return
def get_pc(self, df0, idx_target, pca_plots):
    """Publish SVD, PCA, t-SNE and ISOMAP plots of the data.

    Rows with missing values are dropped and the target is binarised (1 where
    the value is <= 0, otherwise 0) before categoricals are hashed. Each of
    the four output images in ``self.folder_out`` is then renamed to a random
    file name and wrapped in an ``html.Img`` element.

    Args:
        df0: source DataFrame.
        idx_target: positional index of the target column.
        pca_plots: indexable container; slots 0..3 receive the SVD, PCA,
            t-SNE and ISOMAP images in that order.

    Returns:
        The same *pca_plots* container.
    """
    target = df0.columns.to_numpy()[idx_target]

    data = df0.dropna()
    data[target] = (data[target] <= 0).astype(int)
    data = tools_DF.hash_categoricals(data)

    outputs = ['dim_SVD.png', 'dim_PCA.png', 'dim_tSNE.png', 'dim_ISOMAP.png']
    self.P.plot_SVD(data, idx_target, outputs[0])
    self.P.plot_PCA(data, idx_target, outputs[1])
    self.P.plot_tSNE(data, idx_target, outputs[2])
    self.P.plot_ISOMAP(data, idx_target, outputs[3])

    for slot, produced_name in enumerate(outputs):
        URL = next(tempfile._get_candidate_names()) + '.png'
        os.rename(self.folder_out + produced_name, self.folder_out + URL)
        pca_plots[slot] = [html.Img(src=self.app.get_asset_url(URL))]

    return pca_plots
def ex_feature_correlation(df):
    """Save a heatmap of absolute correlations with correlated columns adjacent.

    Columns are ordered greedily: the largest remaining absolute correlation
    is picked repeatedly and its two columns are appended to the ordering if
    not already present. The reordered absolute correlation matrix, with its
    diagonal blanked, is written to ``folder_out + 'corr.png'``.

    Args:
        df: source DataFrame; categoricals are hashed before correlating.

    Returns:
        None.
    """
    df = tools_DF.hash_categoricals(df)
    columns = df.columns.to_numpy()

    # Work on a private, writable copy: the matrix is zeroed in place below.
    corrmat = abs(df.corr()).to_numpy(copy=True)
    for i in range(corrmat.shape[0]):
        corrmat[i, i] = 0

    n_columns = corrmat.shape[1]
    ranks = []
    while len(ranks) < n_columns:
        r, c = numpy.unravel_index(numpy.argmax(corrmat), corrmat.shape)
        r, c = int(r), int(c)

        if corrmat[r, c] == 0:
            # Nothing but zeros is left, so argmax would keep returning the
            # same cell forever. Append the unranked columns in their
            # original order and stop.
            ranks.extend(k for k in range(n_columns) if k not in ranks)
            break

        corrmat[r, c] = 0
        if r not in ranks:
            ranks.append(r)
        if c not in ranks:
            ranks.append(c)

    # An explicit integer dtype keeps an empty ordering usable as an index.
    ranks = numpy.array(ranks, dtype=int)

    corrmat = abs(df[columns[ranks]].corr())
    # Blank the diagonal so self-correlations are not drawn.
    for i in range(corrmat.shape[0]):
        corrmat.iloc[i, i] = numpy.nan

    plt.figure(figsize=(12, 8))
    sns.heatmap(corrmat, vmax=1, square=True, annot=True, fmt='.2f', cmap='GnBu', cbar_kws={"shrink": .5}, robust=True)
    plt.savefig(folder_out + 'corr.png')
    return
features = df0.columns.to_numpy()[numpy.delete( numpy.arange(0, df0.shape[1]), idx_target)] FI = tools_feature_importance.feature_imporance_F_score(df0, idx_target) best_idx = numpy.argsort(-FI) best_features = features[best_idx][:4] df = df0[[target] + best_features.tolist()] df = tools_DF.hash_categoricals(df) P.TS_seaborn(df, numpy.arange(1, df.shape[1]).tolist(), None, filename_out='all_best_features.png') P.TS_seaborn(df, 0, None, filename_out='target.png') return # ---------------------------------------------------------------------------------------------------------------------- df, idx_target = pd.read_csv(folder_in + 'traffic_hourly_small.txt', delimiter=','), 1 #df, idx_target = pd.read_csv(folder_in + 'electricity_hourly_small.txt', delimiter=','), 1 # ---------------------------------------------------------------------------------------------------------------------- if __name__ == '__main__': df = tools_DF.hash_categoricals(df) P.plot_TS_separatly(df, idx_target) #plot_all_in_one(df, idx_target) #P.plot_target_feature(df, idx_target)