def normalize_data_2d():
    """Standardize the data, project it onto two principal components and plot it.

    The input samples and their labels come from ``ia_n2020.ioValues()`` and
    the samples to be predicted from ``ia_n2020.newData()``. The scaler and
    the PCA are fitted on the input samples only; the prediction samples are
    mapped with those fitted estimators so both outputs share one coordinate
    system.

    Side effects:
        Saves a scatter plot of the projected input samples to
        ``graphs/pca_graph.png`` and closes the current figure.

    Returns:
        A tuple ``(input_values_2d, predict_values_2d)`` holding the
        two-component projections of the input and prediction samples.
    """
    input_values, contaminated = ia_n2020.ioValues()
    predict_values = ia_n2020.newData()

    norm = StandardScaler()
    input_values_norm = norm.fit_transform(input_values)
    # Reuse the mean/variance learned from the input data; re-fitting on the
    # prediction data would scale the two sets differently.
    predict_values_norm = norm.transform(predict_values)

    pca = PCA(n_components=2)
    input_values_2d = pca.fit_transform(input_values_norm)
    # Project with the components fitted above; re-fitting would produce
    # axes unrelated to the ones the input data was projected onto.
    predict_values_2d = pca.transform(predict_values_norm)

    # Split the projected points by label so each class is drawn with a single
    # scatter call and always yields a legend handle, even when it is empty.
    clean_pc1, clean_pc2 = [], []
    dirty_pc1, dirty_pc2 = [], []
    for i, (pc1, pc2) in enumerate(input_values_2d):
        label = contaminated[i]
        if label == 0:
            clean_pc1.append(pc1)
            clean_pc2.append(pc2)
        elif label == 1:
            dirty_pc1.append(pc1)
            dirty_pc2.append(pc2)

    reg1 = plt.scatter(clean_pc1, clean_pc2, marker='x', color='g')
    reg2 = plt.scatter(dirty_pc1, dirty_pc2, marker='o', color='b')

    plt.xlabel('PC1')
    plt.ylabel('PC2')
    plt.grid(True)
    plt.legend((reg1, reg2), ('Não contaminado', 'Contaminado'))
    plt.savefig('graphs/pca_graph.png')
    plt.close()

    return input_values_2d, predict_values_2d
def normalize_data_3d():
    """Standardize the data and project it onto three principal components.

    The input samples come from ``ia_n2020.ioValues()`` (its second return
    value is not used here) and the samples to be predicted from
    ``ia_n2020.newData()``. The scaler and the PCA are fitted on the input
    samples only; the prediction samples are mapped with those fitted
    estimators so both outputs share one coordinate system.

    Returns:
        A tuple ``(input_values_3d, predict_values_3d)`` holding the
        three-component projections of the input and prediction samples.
    """
    input_values, _contaminated = ia_n2020.ioValues()
    predict_values = ia_n2020.newData()

    norm = StandardScaler()
    input_values_norm = norm.fit_transform(input_values)
    # Reuse the mean/variance learned from the input data; re-fitting on the
    # prediction data would scale the two sets differently.
    predict_values_norm = norm.transform(predict_values)

    pca = PCA(n_components=3)
    input_values_3d = pca.fit_transform(input_values_norm)
    # Project with the components fitted above; re-fitting would produce
    # axes unrelated to the ones the input data was projected onto.
    predict_values_3d = pca.transform(predict_values_norm)

    return input_values_3d, predict_values_3d
# K-means script: plots the inertia curve for K = 1..10 on the 2-D PCA
# projection, then fits a two-cluster model and appends its predictions for
# the new samples to output_data/predict_data.csv.
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import pandas as pd

import ia_n2020
import pca_graph

# NOTE(review): both names are unused below (``contaminated`` is overwritten
# by the K-means assignments further down); the call is kept in case
# ioValues() has side effects -- confirm before removing.
input_values, contaminated = ia_n2020.ioValues()
input_values_2d, predict_values_2d = pca_graph.normalize_data_2d()

# Collect the inertia of a K-means fit for every candidate cluster count.
cluster_counts = range(1, 11)
inertia = []
for i in cluster_counts:
    kmeans = KMeans(n_clusters=i)
    kmeans.fit(input_values_2d)
    inertia.append(kmeans.inertia_)

# Plot against the real K values; plotting the list alone would put the
# K = 1 result at x = 0 and shift the whole curve by one.
plt.plot(cluster_counts, inertia)
plt.xlabel('# Cluster - K')
plt.ylabel("Inertia")
plt.savefig('graphs/graph_kmeans_inertia')
plt.show()

# Final model: two clusters fitted on the projected input samples.
kmeans_clf = KMeans(n_clusters=2)
kmeans_clf.fit(input_values_2d)

contaminated = kmeans_clf.predict(input_values_2d)
centroid = kmeans_clf.cluster_centers_

contaminated_predict = kmeans_clf.predict(predict_values_2d)
print(contaminated_predict)

# Add the cluster assignments as a new column and rewrite the CSV in place.
spreadsheet = pd.read_csv('output_data/predict_data.csv')
spreadsheet['kmeans_predict'] = contaminated_predict
spreadsheet.to_csv('output_data/predict_data.csv', index=False)