def point_point_plot(df, columns):
    """Save a seaborn joint plot for every unordered pair of ``columns``.

    Each pair is plotted once, in the order the names appear in
    ``columns``, and written to ``outs\\point2point\\<x>--<y>.png``.
    """
    create_folder("outs\\point2point")
    for position, x_name in enumerate(columns):
        # Pair only with the columns that follow, so no pair repeats.
        for y_name in columns[position + 1:]:
            sns.jointplot(x=x_name, y=y_name, data=df)
            target = "outs\\point2point\\{}--{}.png".format(x_name, y_name)
            plt.savefig(target)
            plt.close()
def hex_bin(df, columns):
    """Save a hex-bin joint plot for every unordered pair of ``columns``.

    Output files are written as ``outs\\hex_bin\\<x>--<y>.png``.
    """
    create_folder("outs\\hex_bin")
    total = len(columns)
    for first in range(total):
        x_name = columns[first]
        for second in range(first + 1, total):
            y_name = columns[second]
            sns.jointplot(x=x_name, y=y_name, data=df, kind="hex")
            plt.savefig("outs\\hex_bin\\{}--{}.png".format(x_name, y_name))
            plt.close()
def self_hist2d(df, columns):
    """Save a 50x50 2-D histogram of each column plotted against itself.

    Bins with fewer than one count are left empty (``cmin=1``). Each figure
    gets a colorbar and an x-axis label and is written to
    ``outs\\self_hist2d\\<column>.png``.
    """
    create_folder("outs\\self_hist2d")
    for name in columns:
        values = df[name]
        # The same series is used on both axes.
        plt.hist2d(values, values, (50, 50), cmin=1)
        plt.colorbar()
        plt.xlabel(name)
        plt.savefig("outs\\self_hist2d\\{}.png".format(name))
        plt.close()
def hist2d(df, columns):
    """Save a 50x50 2-D histogram for every unordered pair of ``columns``.

    Bins with fewer than one count are left empty (``cmin=1``). Figures are
    written to ``outs\\hist2d\\<x>--<y>.png``.
    """
    create_folder("outs\\hist2d")
    for position, x_name in enumerate(columns):
        for y_name in columns[position + 1:]:
            plt.hist2d(df[x_name], df[y_name], (50, 50), cmin=1)
            plt.colorbar()
            out_path = "outs\\hist2d\\{}--{}.png".format(x_name, y_name)
            plt.savefig(out_path)
            plt.close()
def pre_processing(df: DataFrame):
    """Clean ``df``, persist each stage to ``information.sqlite``, return it.

    Tables written (each replaces any existing table of the same name):
        before_process      : ``df`` exactly as received
        missing_information : result of ``missing_data(df)``
        outliers            : rows removed by ``drop_numerical_outliers``
        after_clear         : cleaned rows restricted to ``main_columns``
        information         : ``after_clear`` data passed through
                              ``preprocessing.robust_scale``
        describe            : ``describe()`` of the cleaned frame

    File written:
        outs\\dtypes.txt    : string form of the cleaned frame's dtypes

    Processing order:
        1. drop the columns status_id, status_published, Column1..Column4
        2. drop every row that contains a null
        3. split off outliers with ``drop_numerical_outliers``
        4. keep only ``main_columns``
        5. call ``label_encode`` on the cleaned frame
        6. robust-scale the cleaned frame for the ``information`` table

    Returns:
        The cleaned frame. This is the unscaled one; the scaled copy is
        only written to the database.
    """
    # A single connection is reused for every write instead of opening a
    # fresh SqlManager (and leaving it unclosed) per table.
    sql_manager = SqlManager("information.sqlite")
    conn = sql_manager.conn

    df.to_sql(name="before_process", con=conn, if_exists="replace")

    missing_data_df = missing_data(df)
    missing_data_df.to_sql(name="missing_information",
                           con=conn,
                           if_exists="replace")

    df = df.drop(columns=[
        "status_id", "status_published", 'Column1', "Column2", "Column3",
        "Column4"
    ])
    main_df = df.dropna()
    print(main_df.shape)

    outliers_df, main_df = drop_numerical_outliers(main_df)
    main_df = main_df[main_columns]
    outliers_df.to_sql(name="outliers",
                       con=conn,
                       if_exists="replace",
                       index=False)
    main_df.to_sql(name="after_clear",
                   con=conn,
                   if_exists="replace",
                   index=False)

    # NOTE(review): the return value is ignored, so label_encode presumably
    # encodes main_df in place -- confirm against its definition.
    label_encode(main_df)

    scaled_df = DataFrame(preprocessing.robust_scale(main_df),
                          columns=main_columns)
    scaled_df.to_sql(name="information",
                     con=conn,
                     if_exists="replace",
                     index=False)
    print(main_df.shape)

    main_df.describe().to_sql(name="describe", con=conn, if_exists='replace')

    create_folder("outs")
    with open("outs\\dtypes.txt", "w") as file:
        file.write(str(main_df.dtypes))
    return main_df
def density(df, columns):
    """Save a KDE joint plot for every unordered pair of ``columns``.

    Figures are written to ``outs\\density\\<x>--<y>.png``.
    """
    create_folder("outs\\density")
    for position, x_name in enumerate(columns):
        for y_name in columns[position + 1:]:
            sns.jointplot(x=x_name, y=y_name, data=df, kind="kde")
            destination = "outs\\density\\{}--{}.png".format(x_name, y_name)
            plt.savefig(destination)
            plt.close()
def db_scan_plots(df):
    """Scatter-plot every pair of columns, colored by ``df["cluster"]``.

    The column names come from the ``columns`` name defined outside this
    function. One figure per unordered pair is written to
    ``outs\\MainDBSCAN\\<x>---<y>.png``.

    Args:
        df: frame holding every column in ``columns`` plus a ``cluster``
            column used as the point color.
    """
    create_folder("outs\\MainDBSCAN")
    for i, x_col in enumerate(columns):
        for j in range(i + 1, len(columns)):
            y_col = columns[j]
            print(x_col, " ", y_col)
            plt.scatter(df[x_col], df[y_col], c=df["cluster"])
            # The label functions must be called. Assigning to them would
            # replace pyplot's functions with strings, leaving the axes
            # unlabeled and breaking any later plt.xlabel(...) call.
            plt.xlabel(x_col)
            plt.ylabel(y_col)
            plt.savefig("outs\\MainDBSCAN\\{}---{}.png".format(x_col, y_col))
            plt.close()
def k_means_plots(df, centers):
    """Scatter-plot every pair of columns with the cluster centers overlaid.

    The column names come from the ``columns`` name defined outside this
    function. Points are colored by ``df["cluster"]`` and centers are drawn
    as large red ``+`` markers. One figure per unordered pair is written to
    ``outs\\MainKMeans\\<x>---<y>.png``.

    Args:
        df: frame holding every column in ``columns`` plus a ``cluster``
            column used as the point color.
        centers: iterable of cluster centers; each center is indexed with
            the position of a column within ``columns``.
    """
    create_folder("outs\\MainKMeans")
    for i, x_col in enumerate(columns):
        for j in range(i + 1, len(columns)):
            y_col = columns[j]
            print(x_col, " ", y_col)
            plt.scatter(df[x_col], df[y_col], c=df["cluster"])
            x_centers = [center[i] for center in centers]
            y_centers = [center[j] for center in centers]
            plt.scatter(x_centers, y_centers, c="r", marker="+", s=200)
            # The label functions must be called. Assigning to them would
            # replace pyplot's functions with strings, leaving the axes
            # unlabeled and breaking any later plt.xlabel(...) call.
            plt.xlabel(x_col)
            plt.ylabel(y_col)
            plt.savefig("outs\\MainKMeans\\{}---{}.png".format(x_col, y_col))
            plt.close()
def db_scan_each_2_columns(df):
    """Run a default ``DBSCAN`` on every pair of columns and plot the result.

    The column names come from the ``columns`` name defined outside this
    function. For each unordered pair, a copy of the two columns is
    clustered and scatter-plotted, colored by the fitted labels. Figures
    are written to ``outs\\DBSCAN_each2columns\\<x>---<y>.png``. ``df``
    itself is not modified.
    """
    plt.close()
    create_folder("outs\\DBSCAN_each2columns")
    for i, x_col in enumerate(columns):
        for j in range(i + 1, len(columns)):
            y_col = columns[j]
            print(x_col, " ", y_col)
            # Work on a copy so the added "cluster" column never reaches df.
            samples = df[[x_col, y_col]].copy()
            db_scan = DBSCAN()
            db_scan.fit(samples)
            samples["cluster"] = db_scan.labels_
            plt.scatter(samples[x_col], samples[y_col], c=samples["cluster"])
            # The label functions must be called. Assigning to them would
            # replace pyplot's functions with strings, leaving the axes
            # unlabeled and breaking any later plt.xlabel(...) call.
            plt.xlabel(x_col)
            plt.ylabel(y_col)
            plt.savefig("outs\\DBSCAN_each2columns\\{}---{}.png".format(
                x_col, y_col))
            plt.close()
def pie_plots(columns_name):
    """Save one pie chart per column showing how often each value occurs.

    Counts are read from the ``information`` table through the
    ``sql_manager`` defined outside this function. Charts are written to
    ``outs\\pie_plots\\<column>.png``.
    """
    query_template = "select distinct {},count({}) from information group by {}"
    for col in columns_name:
        cursor = sql_manager.crs.execute(query_template.format(col, col, col))
        rows = cursor.fetchall()
        attr = [row[0] for row in rows]
        counts = [row[1] for row in rows]

        _, axes = plt.subplots()
        axes.pie(counts,
                 labels=attr,
                 autopct='%1.1f%%',
                 shadow=True,
                 startangle=90)
        # An equal aspect ratio keeps the pie circular.
        axes.axis('equal')

        create_folder("outs\\pie_plots")
        plt.savefig("outs\\pie_plots\\{}.png".format(col))
        plt.close()
def agglomerative_each_2_columns(df, k):
    """Run agglomerative clustering on every pair of columns and plot it.

    The column names come from the ``columns`` name defined outside this
    function. For each unordered pair, a copy of the two columns is
    clustered into ``k`` clusters and scatter-plotted, colored by the
    fitted labels. Figures are written to
    ``outs\\agglomerative_each2columns\\<x>---<y>.png``. ``df`` itself is
    not modified.

    Args:
        df: frame holding every column in ``columns``.
        k: value passed to ``AgglomerativeClustering`` as ``n_clusters``.
    """
    plt.close()
    create_folder("outs\\agglomerative_each2columns")
    for i, x_col in enumerate(columns):
        for j in range(i + 1, len(columns)):
            y_col = columns[j]
            print(x_col, " ", y_col)
            # Work on a copy so the added "cluster" column never reaches df.
            samples = df[[x_col, y_col]].copy()
            agglomerative = AgglomerativeClustering(n_clusters=k)
            agglomerative.fit(samples)
            samples["cluster"] = agglomerative.labels_
            plt.scatter(samples[x_col], samples[y_col], c=samples["cluster"])
            # The label functions must be called. Assigning to them would
            # replace pyplot's functions with strings, leaving the axes
            # unlabeled and breaking any later plt.xlabel(...) call.
            plt.xlabel(x_col)
            plt.ylabel(y_col)
            plt.savefig("outs\\agglomerative_each2columns\\{}---{}.png".format(
                x_col, y_col))
            plt.close()
def k_means_each_2_columns(df, k):
    """Run k-means on every pair of columns and plot clusters and centers.

    The column names come from the ``columns`` name defined outside this
    function. For each unordered pair, a copy of the two columns is
    clustered with ``KMeans(n_clusters=k, random_state=5)`` and
    scatter-plotted, colored by the fitted labels, with the fitted centers
    drawn as large red ``+`` markers. Figures are written to
    ``outs\\KMeans_each2columns\\<x>---<y>.png``. ``df`` itself is not
    modified.

    Args:
        df: frame holding every column in ``columns``.
        k: value passed to ``KMeans`` as ``n_clusters``.
    """
    plt.close()
    create_folder("outs\\KMeans_each2columns")
    for i, x_col in enumerate(columns):
        for j in range(i + 1, len(columns)):
            y_col = columns[j]
            print(x_col, " ", y_col)
            # Work on a copy so the added "cluster" column never reaches df.
            samples = df[[x_col, y_col]].copy()
            k_means = KMeans(n_clusters=k, random_state=5)
            k_means.fit(samples)
            samples["cluster"] = k_means.labels_
            plt.scatter(samples[x_col], samples[y_col], c=samples["cluster"])
            # The model was fitted on two columns only, so every center
            # holds the x value at position 0 and the y value at position 1.
            centers = k_means.cluster_centers_
            x_centers = [center[0] for center in centers]
            y_centers = [center[1] for center in centers]
            plt.scatter(x_centers, y_centers, c="r", marker="+", s=200)
            # The label functions must be called. Assigning to them would
            # replace pyplot's functions with strings, leaving the axes
            # unlabeled and breaking any later plt.xlabel(...) call.
            plt.xlabel(x_col)
            plt.ylabel(y_col)
            plt.savefig("outs\\KMeans_each2columns\\{}---{}.png".format(
                x_col, y_col))
            plt.close()
def diff(df, cols):
    """Save a histogram of the row-to-row differences of each column.

    Figures are written to ``outs\\diff_hists\\<column>.png``.
    """
    for col in cols:
        deltas = df[col].diff()
        deltas.hist()
        create_folder("outs\\diff_hists")
        out_path = "outs\\diff_hists\\{}.png".format(col)
        plt.savefig(out_path)
        plt.close()
def boxes(columns_name, df):
    """Save a box plot of each listed column of ``df``.

    Figures are written to ``outs\\boxes\\<column>.png``.
    """
    for col in columns_name:
        series = df[col]
        series.plot.box()
        create_folder("outs\\boxes")
        out_path = "outs\\boxes\\{}.png".format(col)
        plt.savefig(out_path)
        plt.close()