def main():
    # load normalized data
    xtrain1, xtest1, ytrain1, ytest1, xtrain2, xtest2, ytrain2, ytest2 = load_data()
    varianceratioplot(xtrain2, ytrain2, "LDA Cum Sum Variance dat2",
                      "figs/lda/varianceratiodat2.png")
    varianceratioplot(xtrain1, ytrain1, "LDA Cum Sum Variance dat1",
                      "figs/lda/varianceratiodat1.png")
    lda = LinearDiscriminantAnalysis()
    data = lda.fit_transform(xtrain2, ytrain2)
    vary_k(xtrain2, data, 20, ytrain2, "dat2")
    plt.clf()
    data = lda.fit_transform(xtrain1, ytrain1)
    vary_k(xtrain1, data, 50, ytrain1, "dat1", iters=2)
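# varianceratioplot is not defined in this snippet. A minimal sketch of what it
# might look like, assuming it plots the cumulative explained_variance_ratio_
# of a fitted LDA model and saves the figure (the name and arguments come from
# the calls above; the body is an assumption, not the original helper):
import matplotlib.pyplot as plt
import numpy as np
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis


def varianceratioplot(x, y, title, path):
    lda = LinearDiscriminantAnalysis()
    lda.fit(x, y)
    # Cumulative share of between-class variance captured per discriminant.
    cumvar = np.cumsum(lda.explained_variance_ratio_)
    plt.plot(range(1, len(cumvar) + 1), cumvar, marker='o')
    plt.xlabel("Number of components")
    plt.ylabel("Cumulative explained variance ratio")
    plt.title(title)
    plt.savefig(path)
    plt.clf()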
def main():
    # load normalized data
    xtrain1, xtest1, ytrain1, ytest1, xtrain2, xtest2, ytrain2, ytest2 = load_data()
    # eigenratioplot(xtrain2, "Cum Sum Explained Variance Ratio per Component dat2",
    #                "figs/pca/explainedvar_dat2.png")
    # eigenratioplot(xtrain1, "Cum Sum Explained Variance Ratio per Component dat1",
    #                "figs/pca/explainedvar_dat1.png")
    # rec_err_plot(xtrain2, 18, "Reconstruction Error dat 2", "figs/pca/recon_err_dat2.png")
    # rec_err_plot(xtrain1, 54, "Reconstruction Error dat 1", "figs/pca/recon_err_dat1.png")
    pca = PCA(n_components=7)
    data = pca.fit_transform(xtrain2)
    vary_k(xtrain2, data, 20, ytrain2, "dat2")
    pca = PCA(n_components=44)
    data = pca.fit_transform(xtrain1)
    vary_k(xtrain1, data, 50, ytrain1, "dat1")
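# eigenratioplot (commented out above) is not defined in this snippet. A
# plausible sketch, assuming it plots the cumulative PCA explained variance
# ratio used to pick n_components=7 and 44 (an assumption, not the original
# implementation):
import matplotlib.pyplot as plt
import numpy as np
from sklearn.decomposition import PCA


def eigenratioplot(x, title, path):
    pca = PCA().fit(x)
    cumvar = np.cumsum(pca.explained_variance_ratio_)
    plt.plot(range(1, len(cumvar) + 1), cumvar, marker='o')
    plt.xlabel("Number of components")
    plt.ylabel("Cumulative explained variance ratio")
    plt.title(title)
    plt.savefig(path)
    plt.clf()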
def main():
    # load normalized data
    xtrain1, xtest1, ytrain1, ytest1, xtrain2, xtest2, ytrain2, ytest2 = load_data()
    kurt(xtrain1, 55, "ICA Mean Kurtosis vs num_components dat1", "figs/ica/kurtdat1.png")
    kurt(xtrain2, 20, "ICA Mean Kurtosis vs num_components dat2", "figs/ica/kurtdat2.png")
    rec_err_plot(xtrain2, 18, "Reconstruction Error dat 2", "figs/ica/recon_err_dat2.png")
    rec_err_plot(xtrain1, 55, "Reconstruction Error dat 1", "figs/ica/recon_err_dat1.png")
    ica = FastICA(n_components=3)
    data = ica.fit_transform(xtrain2)
    vary_k(xtrain2, data, 20, ytrain2, "dat2test")
    ica = FastICA(n_components=36)
    data = ica.fit_transform(xtrain1)
    vary_k(xtrain1, data, 50, ytrain1, "dat1")
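# kurt is not defined in this snippet. ICA looks for maximally non-Gaussian
# components, and kurtosis is a standard non-Gaussianity proxy, so the helper
# presumably sweeps n_components and plots the mean kurtosis. A hedged sketch
# (only the name and arguments come from the calls above; the body is an
# assumption):
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import kurtosis
from sklearn.decomposition import FastICA


def kurt(x, maxn, title, path):
    means = []
    for n in range(1, maxn + 1):
        comps = FastICA(n_components=n, max_iter=1000).fit_transform(x)
        # Average absolute (Fisher) kurtosis across the n recovered components.
        means.append(np.mean(np.abs(kurtosis(comps, axis=0))))
    plt.plot(range(1, maxn + 1), means)
    plt.xlabel("Number of components")
    plt.ylabel("Mean |kurtosis|")
    plt.title(title)
    plt.savefig(path)
    plt.clf()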
def main():
    xtrain1, xtest1, ytrain1, ytest1, xtrain2, xtest2, ytrain2, ytest2 = load_data()
    vary_k(xtrain2, 20, ytrain2, "dat2")
    vary_k(xtrain1, 50, ytrain1, "dat1")
    bic_model_selection(xtrain2, 20, "BIC per model dat2",
                        "figs/em/bic_dat2.png", "BIC Score")
    bic_model_selection(xtrain2, 20, "AIC per model dat2",
                        "figs/em/aic_dat2.png", "AIC Score")
    bic_model_selection(xtrain2, 20, "Average Log likelihood per model dat2",
                        "figs/em/score_dat2.png", "Average Log Likelihood Score")
    bic_model_selection(xtrain1, 100, "BIC per model dat1",
                        "figs/em/bic_dat1.png", "BIC Score")
    bic_model_selection(xtrain1, 100, "AIC per model dat1",
                        "figs/em/aic_dat1.png", "AIC Score")
    bic_model_selection(xtrain1, 100, "Average Log likelihood per model dat1",
                        "figs/em/score_dat1.png", "Average Log Likelihood Score")
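# bic_model_selection is not defined here. Since it is called once per
# criterion (BIC, AIC, average log-likelihood), one plausible shape is a sweep
# over component counts and covariance types that plots the requested score.
# A sketch under those assumptions, not the original code:
import matplotlib.pyplot as plt
from sklearn.mixture import GaussianMixture


def bic_model_selection(x, kmax, title, path, ylabel):
    for cov in ['spherical', 'tied', 'diag', 'full']:
        vals = []
        for k in range(1, kmax + 1):
            gm = GaussianMixture(n_components=k, covariance_type=cov).fit(x)
            if "BIC" in ylabel:
                vals.append(gm.bic(x))
            elif "AIC" in ylabel:
                vals.append(gm.aic(x))
            else:
                vals.append(gm.score(x))  # mean per-sample log-likelihood
        plt.plot(range(1, kmax + 1), vals, label=cov)
    plt.xlabel("Number of components")
    plt.ylabel(ylabel)
    plt.title(title)
    plt.legend()
    plt.savefig(path)
    plt.clf()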
def main():
    xtrain1, xtest1, ytrain1, ytest1, xtrain2, xtest2, ytrain2, ytest2 = load_data()
    # Silhouette diagnostic and homogeneity check for a 4-cluster fit on dat1.
    km = KMeans(n_clusters=4)
    visualizer = SilhouetteVisualizer(km, colors='yellowbrick')
    visualizer.fit(xtrain1)
    visualizer.show()
    clusters = km.fit_predict(xtrain1)
    print(metrics.homogeneity_score(ytrain1, clusters))
    score(xtrain2, 20, ytrain2)
    # Elbow/score curves over k for both datasets.
    elbowplot(xtrain2, 20, "distortion",
              "K Means Clustering Distortion vs Number of Clusters dat2",
              "figs/kmeans/kmeans_elbow_dat2.png")
    elbowplot(xtrain1, 100, "distortion",
              "K Means Clustering Distortion vs Number of Clusters dat1",
              "figs/kmeans/kmeans_elbow_dat1.png")
    elbowplot(xtrain2, 40, "silhouette",
              "K Means Clustering Silhouette Score vs Number of Clusters dat2",
              "figs/kmeans/kmeans_silhouette_dat2.png", elbow=False)
    elbowplot(xtrain1, 100, "silhouette",
              "K Means Clustering Silhouette Score vs Number of Clusters dat1",
              "figs/kmeans/kmeans_silhouette_dat1.png", elbow=False)
    elbowplot(xtrain2, 20, "calinski_harabasz",
              "K Means Clustering Calinski Harabasz Score vs Number of Clusters dat2",
              "figs/kmeans/kmeans_calinski_dat2.png", elbow=False)
    elbowplot(xtrain1, 100, "calinski_harabasz",
              "K Means Clustering Calinski Harabasz Score vs Number of Clusters dat1",
              "figs/kmeans/kmeans_calinski_dat1.png", elbow=False)
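# elbowplot is not defined in this snippet, but the metric strings
# ('distortion', 'silhouette', 'calinski_harabasz') and the elbow flag match
# yellowbrick's KElbowVisualizer, so it may be a thin wrapper along these
# lines (an assumption, not the original helper):
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from yellowbrick.cluster import KElbowVisualizer


def elbowplot(x, kmax, metric, title, path, elbow=True):
    viz = KElbowVisualizer(KMeans(), k=(2, kmax), metric=metric,
                           locate_elbow=elbow, timings=False, title=title)
    viz.fit(x)
    viz.show(outpath=path)
    plt.clf()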
def cluster_nn():
    out = "csv output/"
    xtrain1, xtest1, ytrain1, ytest1, xtrain2, xtest2, ytrain2, ytest2 = load_data(test_size=0.05)

    # KMeans -> NN: grid-search the number of clusters feeding the tuned net.
    nn2 = MLPClassifier(activation='relu', alpha=0.001, hidden_layer_sizes=(140,),
                        learning_rate_init=0.0033333366666666664)
    km = KMeans()
    pipe = Pipeline(steps=[('km', km), ('neuralnet', nn2)])
    grid = {'km__n_clusters': np.arange(1, 20, 1)}
    gs = GridSearchCV(pipe, grid, return_train_score=True, verbose=10, cv=5)
    gs.fit(xtrain2, ytrain2)
    tmp = pd.DataFrame(gs.cv_results_)
    tmp.to_csv(out + 'kmnndat2.csv')

    # EM (GMM) -> NN via the myGMM wrapper.
    nn2 = MLPClassifier(activation='relu', alpha=0.001, hidden_layer_sizes=(140,),
                        learning_rate_init=0.0033333366666666664)
    em = myGMM()
    pipe = Pipeline(steps=[('em', em), ('neuralnet', nn2)])
    grid = {'em__n_components': np.arange(1, 20, 1)}
    gs = GridSearchCV(pipe, grid, return_train_score=True, verbose=10, cv=5)
    gs.fit(xtrain2, ytrain2)
    tmp = pd.DataFrame(gs.cv_results_)
    tmp.to_csv(out + 'emnndat2.csv')

    # EM -> NN with cluster features stacked onto the original features.
    nn2 = MLPClassifier(activation='relu', alpha=0.001, hidden_layer_sizes=(140,),
                        learning_rate_init=0.0033333366666666664)
    em = myGMMstack()
    pipe = Pipeline(steps=[('em', em), ('neuralnet', nn2)])
    grid = {'em__n_components': np.arange(1, 20, 1)}
    gs = GridSearchCV(pipe, grid, return_train_score=True, verbose=10, cv=5)
    gs.fit(xtrain2, ytrain2)
    tmp = pd.DataFrame(gs.cv_results_)
    tmp.to_csv(out + 'emstacknndat2.csv')

    kmeanstack()
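# myGMM and myGMMstack are not defined in this snippet. sklearn's
# GaussianMixture has no transform(), which Pipeline requires of every
# intermediate step, so they are presumably small wrappers along these lines
# (an assumption; only the names and the n_components grid come from the code
# above):
import numpy as np
from sklearn.mixture import GaussianMixture


class myGMM(GaussianMixture):
    def transform(self, X):
        # Soft cluster memberships become the downstream NN's features.
        return self.predict_proba(X)


class myGMMstack(GaussianMixture):
    def transform(self, X):
        # Variant that appends the memberships to the original features.
        return np.hstack((X, self.predict_proba(X)))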
def main():
    # load normalized data
    xtrain1, xtest1, ytrain1, ytest1, xtrain2, xtest2, ytrain2, ytest2 = load_data()
    # dat1: 36, dat2: 3
    rec_err_plot(xtrain2, 18, "Random Projection Reconstruction Error dat2",
                 "figs/rca/recondat2.png")
    rec_err_plot(xtrain1, 54, "Random Projection Reconstruction Error dat1",
                 "figs/rca/recondat1.png")
    sil_plots(xtrain2, 18, 4, "dat2")
    sil_plots(xtrain1, 54, 44, "dat1")
    # Values recorded in the original source (likely reconstruction errors;
    # context not given):
    # 0.03288655685198534, 0.017619964663429535,
    # 0.007523731695415414, 0.005271076420350498
    transformer = GaussianRandomProjection(n_components=14)
    data = transformer.fit_transform(xtrain2)
    vary_k(xtrain2, data, 20, ytrain2, "dat2")
    transformer = GaussianRandomProjection(n_components=44)
    data = transformer.fit_transform(xtrain1)
    vary_k(xtrain1, data, 50, ytrain1, "dat1")
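# rec_err_plot is not defined here. A random projection can be approximately
# inverted with the pseudo-inverse of its projection matrix, so a plausible
# implementation of the reconstruction-error sweep is the following (a sketch,
# assuming mean-squared error; the original body is not shown):
import matplotlib.pyplot as plt
import numpy as np
from sklearn.random_projection import GaussianRandomProjection


def rec_err_plot(x, maxn, title, path):
    errs = []
    for n in range(1, maxn + 1):
        rp = GaussianRandomProjection(n_components=n)
        z = rp.fit_transform(x)                      # project: z = x @ W.T
        xhat = z @ np.linalg.pinv(rp.components_.T)  # reconstruct via pinv
        errs.append(np.mean((x - xhat) ** 2))
    plt.plot(range(1, maxn + 1), errs)
    plt.xlabel("Number of components")
    plt.ylabel("Mean squared reconstruction error")
    plt.title(title)
    plt.savefig(path)
    plt.clf()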
def kmeanstack():
    frame = np.ones((5, 20))
    times = np.ones((5, 20))
    for j in range(5):
        xtrain1, xtest1, ytrain1, ytest1, xtrain2, xtest2, ytrain2, ytest2 = load_data(test_size=0.2)
        for i in range(1, 20, 1):
            # Append each sample's distances to the i cluster centers to the
            # raw features (KMeans.fit_transform returns those distances).
            km = KMeans(n_clusters=i)
            xnew = np.hstack((xtrain2, km.fit_transform(xtrain2)))
            xtestnew = np.hstack((xtest2, km.transform(xtest2)))
            nn2 = MLPClassifier(activation='relu', alpha=0.001,
                                hidden_layer_sizes=(140,),
                                learning_rate_init=0.0033333366666666664)
            start = time.time()
            nn2.fit(xnew, ytrain2)
            fittime = time.time() - start
            times[j][i - 1] = fittime
            frame[j][i - 1] = nn2.score(xtestnew, ytest2)
    np.savetxt("nnkmstack.csv", frame, delimiter=",")
    np.savetxt("nnkmstacktimes.csv", times, delimiter=",")
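# Design note on kmeanstack: the loop augments the original features with k
# distance features rather than replacing them, then measures how NN test
# accuracy and fit time move as k grows, repeated over 5 resampled splits.
# Note that k only runs 1..19, so the 20th column of frame/times is never
# written and keeps its initial value of 1.0.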
import datetime

from app import save_data, load_data

# Save one record ("新宿" = Shinjuku, "渋谷" = Shibuya, "テスト" = "test")
# timestamped 2020-10-31 00:00, then print everything read back.
save_data("新宿", "渋谷", "テスト", datetime.datetime(2020, 10, 31, 0))
print(load_data())
def setUp(self):
    importlib.reload(app)
    documents, directories = app.load_data()
    self.docs = documents
    self.dirs = directories
    self.commands = app.commands
import csv

import maps
import multicampi
import pandas as pd
import unidecode as und

import app


def obter_nome_correto(nome_errado, localidades):
    """Return the canonical city name whose key appears in the lowercased,
    accent-stripped misspelled name; fall back to the original value."""
    for localidade in localidades.keys():
        if localidade in und.unidecode(str.lower(nome_errado)):
            return localidade
    print(nome_errado)  # log names that could not be matched
    return nome_errado


localidades = maps.load_cities_coordinates('data/localidades.csv')
df = app.load_data('data/dados_pesquisa.csv', 0)
df['cidade'] = df['cidade'].apply(lambda c: obter_nome_correto(c, localidades))
df.to_csv('cidades_normalizadas3.csv', sep=';')
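# Demo of the matching rule obter_nome_correto relies on: a canonical key
# matches when it is a substring of the accent-stripped, lowercased raw value
# (the example input below is made up for illustration):
import unidecode as und

raw = "São Paulo - SP"
print("sao paulo" in und.unidecode(str.lower(raw)))  # True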
def reduction_nn():
    out = "csv output/"
    xtrain1, xtest1, ytrain1, ytest1, xtrain2, xtest2, ytrain2, ytest2 = load_data(test_size=0.05)

    # Baseline: the tuned NN on the raw features (empty grid = plain 5-fold CV).
    nn2 = MLPClassifier(activation='relu', alpha=0.001, hidden_layer_sizes=(140,),
                        learning_rate_init=0.0033333366666666664)
    gs = GridSearchCV(nn2, {}, return_train_score=True, verbose=10, cv=5)
    gs.fit(xtrain2, ytrain2)
    tmp = pd.DataFrame(gs.cv_results_)
    tmp.to_csv(out + 'dat2.csv')

    # PCA -> NN.
    nn2 = MLPClassifier(activation='relu', alpha=0.001, hidden_layer_sizes=(140,),
                        learning_rate_init=0.0033333366666666664)
    pca = PCA()
    pipe = Pipeline(steps=[('pca', pca), ('neuralnet', nn2)])
    grid = {'pca__n_components': np.arange(1, 19, 1)}
    gs = GridSearchCV(pipe, grid, return_train_score=True, verbose=2, cv=5)
    gs.fit(xtrain2, ytrain2)
    tmp = pd.DataFrame(gs.cv_results_)
    tmp.to_csv(out + 'pcadat2.csv')

    # ICA -> NN.
    nn2 = MLPClassifier(activation='relu', alpha=0.001, hidden_layer_sizes=(140,),
                        learning_rate_init=0.0033333366666666664)
    ica = FastICA()
    pipe = Pipeline(steps=[('ica', ica), ('neuralnet', nn2)])
    grid = {'ica__n_components': np.arange(1, 19, 1)}
    gs = GridSearchCV(pipe, grid, return_train_score=True, verbose=10, cv=5)
    gs.fit(xtrain2, ytrain2)
    tmp = pd.DataFrame(gs.cv_results_)
    tmp.to_csv(out + 'icadat2.csv')

    # Random projection -> NN.
    nn2 = MLPClassifier(activation='relu', alpha=0.001, hidden_layer_sizes=(140,),
                        learning_rate_init=0.0033333366666666664)
    rca = GaussianRandomProjection()
    pipe = Pipeline(steps=[('rca', rca), ('neuralnet', nn2)])
    grid = {'rca__n_components': np.arange(1, 19, 1)}
    gs = GridSearchCV(pipe, grid, return_train_score=True, verbose=10, cv=5)
    gs.fit(xtrain2, ytrain2)
    tmp = pd.DataFrame(gs.cv_results_)
    tmp.to_csv(out + 'rcadat2.csv')

    # LDA -> NN. LDA allows at most min(n_classes - 1, n_features) components,
    # hence the small grid.
    nn2 = MLPClassifier(activation='relu', alpha=0.001, hidden_layer_sizes=(140,),
                        learning_rate_init=0.0033333366666666664)
    lda = LinearDiscriminantAnalysis()
    pipe = Pipeline(steps=[('lda', lda), ('neuralnet', nn2)])
    grid = {'lda__n_components': np.arange(1, 4, 1)}
    gs = GridSearchCV(pipe, grid, return_train_score=True, verbose=10, cv=5)
    gs.fit(xtrain2, ytrain2)
    tmp = pd.DataFrame(gs.cv_results_)
    tmp.to_csv(out + 'ldadat2.csv')
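# Follow-up sketch: the CSVs written above all come from GridSearchCV's
# cv_results_, so the reduction methods can be compared by their best mean CV
# score (filenames assumed from the code above):
import pandas as pd

for name in ['dat2', 'pcadat2', 'icadat2', 'rcadat2', 'ldadat2']:
    df = pd.read_csv('csv output/' + name + '.csv')
    print(name, df['mean_test_score'].max())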