def similarity_matrix(Corrected_Features, similarity):
    """Write a pairwise dissimilarity matrix between feature CSV files.

    For every ordered pair of non-hidden files under ``Corrected_Features``,
    the per-feature column means of each file are compared; the mean absolute
    difference is written as one CSV cell.  One output row per outer file.

    Parameters
    ----------
    Corrected_Features : str
        Directory path (must end with a separator) holding per-class feature
        CSVs.  Column 0 is the class label, remaining columns are features.
    similarity : str
        Path of the output file, overwritten on each call.
    """
    # Use a distinct handle name: the original shadowed the `file` builtin.
    out = open(similarity, "w")
    try:
        for root, dirs, files in os.walk(Corrected_Features):
            for j in files:
                if j.startswith('.'):
                    continue
                input_1 = Corrected_Features + j
                # -1 because we skip the neuron-name column
                ncol1 = tools.file_col_coma(input_1) - 1
                data = np.loadtxt(input_1, delimiter=',', usecols=range(ncol1))
                X1 = data[:, :]
                # Mean of each feature column (column 0 is the label).
                mtype1 = [np.mean(X1[:, col]) for col in xrange(1, ncol1)]
                for root2, dirs2, files2 in os.walk(Corrected_Features):
                    for i in files2:
                        if i.startswith('.'):
                            continue
                        input_2 = Corrected_Features + i
                        ncol2 = tools.file_col_coma(input_2) - 1
                        data = np.loadtxt(input_2, delimiter=',',
                                          usecols=range(ncol2))
                        X2 = data[:, :]
                        mtype2 = [np.mean(X2[:, col]) for col in xrange(1, ncol2)]
                        # The original indexed a list sized by the OUTER file
                        # with the INNER file's column count (IndexError when
                        # widths differ); compare only the common prefix.
                        ncommon = min(len(mtype1), len(mtype2))
                        diffs = [np.abs(mtype2[c] - mtype1[c])
                                 for c in xrange(ncommon)]
                        # Distinct name: the original rebound the `similarity`
                        # parameter (the output path) to this float.
                        score = np.mean(diffs)
                        out.write("%f," % score)
                out.write("\n")
    finally:
        out.close()
def neuron_similarity_matrix_labelled(Preprocessed_file, similarity, Corrected_Features):
    """Write a labelled pairwise dissimilarity matrix between feature files.

    Row 1 is a header: "mtype," followed by each file's mean class label.
    Each later row starts with the outer file's mean label, then one cell per
    inner file holding the mean absolute difference of per-feature means.

    NOTE(review): `Preprocessed_file` is never used in this function.
    NOTE(review): `f` is sized from the OUTER file's column count but indexed
    with the INNER file's — assumes all files share the same width; confirm.
    """
    lvltrace.lvltrace(
        "LVLEntree dans neuron_similarity_matrix_labelled data_preproc")
    file = open(similarity, "w")  # shadows the `file` builtin (legacy style)
    writer = csv.writer(file, lineterminator=',')  # NOTE(review): unused
    file.write("mtype,")
    # Header row: the mean of column 0 (the class label) of every file.
    for root, dirs, files in os.walk(Corrected_Features):
        for v in files:
            if not v.startswith('.'):
                input = Corrected_Features + v
                ncol = tools.file_col_coma(input) - 1
                data = np.loadtxt(input, delimiter=',', usecols=range(
                    ncol))  # ncol-1 because we skip the neuron names
                y = data[:, 0].astype(np.int)  # Labels (class)
                file.write("%i," % np.mean(y))
    file.write("\n")
    # Body: one row per (outer) file.
    for root, dirs, files in os.walk(Corrected_Features):
        for j in files:
            if not j.startswith('.'):
                input_1 = Corrected_Features + j
                ncol1 = tools.file_col_coma(input_1) - 1
                data = np.loadtxt(input_1, delimiter=',', usecols=range(
                    ncol1))  # ncol-1 because we skip the neuron names
                X1 = data[:, :]
                y1 = data[:, 0].astype(np.int)  # Labels (class)
                mtype1 = [0 for x in xrange(ncol1 - 1)]
                f = [0 for x in xrange(ncol1 - 1)]
                label1 = np.mean(y1)
                for col in xrange(1, ncol1):
                    mtype1[col - 1] = np.mean(X1[:, col])  # mean of each feature
                file.write("%i," % label1)  # row label = mean class of file j
                # Inner pass: one dissimilarity cell per other file.
                for root, dirs, files in os.walk(Corrected_Features):
                    for i in files:
                        if not i.startswith('.'):
                            input_2 = Corrected_Features + i
                            ncol2 = tools.file_col_coma(input_2) - 1
                            data = np.loadtxt(
                                input_2, delimiter=',', usecols=range(ncol2)
                            )  # ncol-1 because we skip the neuron names
                            X2 = data[:, :]
                            y2 = data[:, 0].astype(np.int)  # Labels (class)
                            mtype2 = [0 for x in xrange(ncol2 - 1)]
                            for col in xrange(1, ncol2):
                                mtype2[col - 1] = np.mean(
                                    X2[:, col])  # mean of each feature
                            for col in xrange(1, ncol2):
                                f[col - 1] = np.abs(mtype2[col - 1] - mtype1[col - 1])
                            # NOTE(review): rebinds the `similarity` parameter
                            # (the output path) to this float.
                            similarity = np.mean(f)
                            file.write("%f," % similarity)
                file.write("\n")
    file.close()
    lvltrace.lvltrace(
        "LVLSortie dans neuron_similarity_matrix_labelled data_preproc")
def neuron_similarity_matrix_labelled(Preprocessed_file, similarity, Corrected_Features):
    """Write a labelled pairwise dissimilarity matrix between feature files.

    Duplicate of an earlier, identically-named definition in this module —
    the later binding wins at import time.

    Row 1 is a header: "mtype," followed by each file's mean class label.
    Each later row: the outer file's mean label, then one cell per inner file
    with the mean absolute difference of per-feature column means.

    NOTE(review): `Preprocessed_file` is unused; `f` is sized from the outer
    file's width but indexed with the inner file's — assumes equal widths.
    """
    lvltrace.lvltrace("LVLEntree dans neuron_similarity_matrix_labelled data_preproc")
    file = open(similarity, "w")  # shadows the `file` builtin (legacy style)
    writer = csv.writer(file, lineterminator=',')  # NOTE(review): unused
    file.write("mtype,")
    # Header row: mean of the class-label column of every file.
    for root, dirs, files in os.walk(Corrected_Features):
        for v in files:
            if not v.startswith('.'):
                input = Corrected_Features + v
                ncol = tools.file_col_coma(input) - 1
                data = np.loadtxt(input, delimiter=',', usecols=range(ncol))  # ncol-1 because we skip the neuron names
                y = data[:, 0].astype(np.int)  # Labels (class)
                file.write("%i," % np.mean(y))
    file.write("\n")
    # Body: one row per outer file.
    for root, dirs, files in os.walk(Corrected_Features):
        for j in files:
            if not j.startswith('.'):
                input_1 = Corrected_Features + j
                ncol1 = tools.file_col_coma(input_1) - 1
                data = np.loadtxt(input_1, delimiter=',', usecols=range(ncol1))  # ncol-1 because we skip the neuron names
                X1 = data[:, :]
                y1 = data[:, 0].astype(np.int)  # Labels (class)
                mtype1 = [0 for x in xrange(ncol1 - 1)]
                f = [0 for x in xrange(ncol1 - 1)]
                label1 = np.mean(y1)
                for col in xrange(1, ncol1):
                    mtype1[col - 1] = np.mean(X1[:, col])  # mean of each feature
                file.write("%i," % label1)  # row label = mean class of file j
                for root, dirs, files in os.walk(Corrected_Features):
                    for i in files:
                        if not i.startswith('.'):
                            input_2 = Corrected_Features + i
                            ncol2 = tools.file_col_coma(input_2) - 1
                            data = np.loadtxt(input_2, delimiter=',', usecols=range(ncol2))  # ncol-1 because we skip the neuron names
                            X2 = data[:, :]
                            y2 = data[:, 0].astype(np.int)  # Labels (class)
                            mtype2 = [0 for x in xrange(ncol2 - 1)]
                            for col in xrange(1, ncol2):
                                mtype2[col - 1] = np.mean(X2[:, col])  # mean of each feature
                            for col in xrange(1, ncol2):
                                f[col - 1] = np.abs(mtype2[col - 1] - mtype1[col - 1])
                            # NOTE(review): rebinds the `similarity` parameter.
                            similarity = np.mean(f)
                            file.write("%f," % similarity)
                file.write("\n")
    file.close()
    lvltrace.lvltrace("LVLSortie dans neuron_similarity_matrix_labelled data_preproc")
def randomforest(input_file, Output):
    """Fit a RandomForestClassifier on the full dataset and report metrics.

    Trains on all rows of `input_file` (CSV; column 0 = class label, rest =
    features) and evaluates on the SAME data (resubstitution — training
    accuracy, not generalization).  Prints metrics, writes them plus the
    per-sample true/predicted pairs to "Random_Forest_metrics.txt" under
    `Output`, and saves a confusion-matrix PNG.
    """
    lvltrace.lvltrace("LVLEntree dans randomforest")
    ncol = tools.file_col_coma(input_file)
    # Drop the last column (see file_col_coma convention used module-wide).
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol - 1))
    X = data[:, 1:]   # features
    y = data[:, 0]    # class labels
    n_samples, n_features = X.shape
    clf = RandomForestClassifier(n_estimators=10)
    clf.fit(X, y)
    y_pred = clf.predict(X)  # predictions on the training data itself
    print "#########################################################################################################\n"
    print "The Random forest algo "
    print "classification accuracy:", metrics.accuracy_score(y, y_pred)
    print "precision:", metrics.precision_score(y, y_pred)
    print "recall:", metrics.recall_score(y, y_pred)
    print "f1 score:", metrics.f1_score(y, y_pred)
    print "\n"
    print "#########################################################################################################\n"
    results = Output + "Random_Forest_metrics.txt"
    file = open(results, "w")  # shadows the `file` builtin (legacy style)
    file.write("Random Forest Classifier estimator accuracy\n")
    file.write("Classification Accuracy Score: %f\n" % metrics.accuracy_score(y, y_pred))
    file.write("Precision Score: %f\n" % metrics.precision_score(y, y_pred))
    file.write("Recall Score: %f\n" % metrics.recall_score(y, y_pred))
    file.write("F1 Score: %f\n" % metrics.f1_score(y, y_pred))
    file.write("\n")
    file.write("True Value, Predicted Value, Iteration\n")
    for n in xrange(len(y)):
        file.write("%f,%f,%i\n" % (y[n], y_pred[n], (n + 1)))
    file.close()
    title = "The Random forest"
    save = Output + "Random_Forest_confusion_matrix.png"
    plot_confusion_matrix(y, y_pred, title, save)
    lvltrace.lvltrace("LVLSortie dans randomforest")
def extratreeclassifier(input_file,Output): lvltrace.lvltrace("LVLEntree dans extratreeclassifier") ncol=tools.file_col_coma(input_file) data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1)) X = data[:,1:] y = data[:,0] n_samples, n_features = X.shape clf = ExtraTreesClassifier(n_estimators=10) clf.fit(X,y) y_pred = clf.predict(X) print "#########################################################################################################\n" print " Extremely Randomized Trees " print "classification accuracy:", metrics.accuracy_score(y, y_pred) print "precision:", metrics.precision_score(y, y_pred) print "recall:", metrics.recall_score(y, y_pred) print "\n" print "#########################################################################################################\n" results = Output+"_Extremely_Random_Forest_metrics.txt" file = open(results, "w") file.write("Extremely Random Forest Classifier estimator accuracy\n") file.write("Classification Accuracy Score: %f\n"%metrics.accuracy_score(y, y_pred)) file.write("Precision Score: %f\n"%metrics.precision_score(y, y_pred)) file.write("Recall Score: %f\n"%metrics.recall_score(y, y_pred)) file.write("F1 Score: %f\n"%metrics.f1_score(y, y_pred)) file.write("\n") file.write("True Value, Predicted Value, Iteration\n") for n in xrange(len(y)): file.write("%f,%f,%i\n"%(y[n],y_pred[n],(n+1))) file.close() title = "Extremely Randomized Trees" save = Output + "Extremely_Randomized_Trees_confusion_matrix.png" plot_confusion_matrix(y, y_pred,title,save) lvltrace.lvltrace("LVLSortie dans extratreeclassifier")
def SVC_linear(input_file, Output):
    """Fit a linear-kernel SVC on the full dataset and report metrics.

    Trains on all rows of `input_file` (CSV; column 0 = class label, rest =
    features) and evaluates on the SAME data (resubstitution).  Prints the
    metrics, writes them plus per-sample true/predicted pairs to
    "SVM_Linear_Kernel_metrics.txt" under `Output`, and saves a
    confusion-matrix PNG.

    NOTE(review): a later identically-named definition (train/test-split
    variant) shadows this one at import time.
    """
    lvltrace.lvltrace("LVLEntree dans SVC_linear")
    ncol = tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol - 1))
    X = data[:, 1:]   # features
    y = data[:, 0]    # class labels
    n_samples, n_features = X.shape
    clf = svm.SVC(kernel='linear')
    clf.fit(X, y)
    y_pred = clf.predict(X)  # predictions on the training data itself
    print "#########################################################################################################\n"
    print "C-Support Vector Classifcation (with linear kernel) "
    print "classification accuracy:", metrics.accuracy_score(y, y_pred)
    print "precision:", metrics.precision_score(y, y_pred)
    print "recall:", metrics.recall_score(y, y_pred)
    print "f1 score:", metrics.f1_score(y, y_pred)
    print "\n"
    print "#########################################################################################################\n"
    results = Output + "SVM_Linear_Kernel_metrics.txt"
    file = open(results, "w")  # shadows the `file` builtin (legacy style)
    file.write("Support Vector Machine with Linear Kernel estimator accuracy\n")
    file.write("Classification Accuracy Score: %f\n" % metrics.accuracy_score(y, y_pred))
    file.write("Precision Score: %f\n" % metrics.precision_score(y, y_pred))
    file.write("Recall Score: %f\n" % metrics.recall_score(y, y_pred))
    file.write("F1 Score: %f\n" % metrics.f1_score(y, y_pred))
    file.write("\n")
    file.write("True Value, Predicted Value, Iteration\n")
    for n in xrange(len(y)):
        file.write("%f,%f,%i\n" % (y[n], y_pred[n], (n + 1)))
    file.close()
    title = "SVC - linear Kernel"
    save = Output + "SVC_linear_confusion_matrix.png"
    plot_confusion_matrix(y, y_pred, title, save)
    lvltrace.lvltrace("LVLSortie dans SVC_linear")
def SVC_linear(input_file,Output,test_size): lvltrace.lvltrace("LVLEntree dans SVC_linear split_test") ncol=tools.file_col_coma(input_file) data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1)) X = data[:,1:] y = data[:,0] n_samples, n_features = X.shape X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size) print X_train.shape, X_test.shape clf=svm.SVC(kernel='linear') clf.fit(X_train,y_train) y_pred = clf.predict(X_test) print "C-Support Vector Classifcation (with RBF linear) " print "y_test, y_pred, iteration" print "classification accuracy:", metrics.accuracy_score(y_test, y_pred) print "precision:", metrics.precision_score(y_test, y_pred) print "recall:", metrics.recall_score(y_test, y_pred) print "f1 score:", metrics.f1_score(y_test, y_pred) print "\n" results = Output+"SVM_Linear_Kernel_metrics_test.txt" file = open(results, "w") file.write("Support Vector Machine with Linear Kernel estimator accuracy\n") file.write("Classification Accuracy Score: %f\n"%metrics.accuracy_score(y_test, y_pred)) file.write("Precision Score: %f\n"%metrics.precision_score(y_test, y_pred)) file.write("Recall Score: %f\n"%metrics.recall_score(y_test, y_pred)) file.write("F1 Score: %f\n"%metrics.f1_score(y_test, y_pred)) file.write("\n") file.write("True Value, Predicted Value, Iteration\n") for n in xrange(len(y_test)): file.write("%f,%f,%i\n"%(y_test[n],y_pred[n],(n+1))) file.close() title = "SVC linear %f"%test_size save = Output + "SVC_linear_confusion_matrix"+"_%s.png"%test_size plot_confusion_matrix(y_test, y_pred,title,save) lvltrace.lvltrace("LVLsortie dans SVC_linear split_test")
def gaussianNB(input_file, Output, test_size):
    """Fit Gaussian Naive Bayes with a train/test split and report metrics.

    Splits `input_file` (CSV; column 0 = class label, rest = features) with
    `train_test_split`, fits on the train part, evaluates on the test part.
    Prints metrics, writes them plus per-sample true/predicted pairs to
    "GaussianNB_metrics_test.txt" under `Output`, and saves a
    confusion-matrix PNG suffixed with `test_size`.
    """
    lvltrace.lvltrace("LVLEntree dans gaussianNB split_test")
    ncol = tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol - 1))
    X = data[:, 1:]   # features
    y = data[:, 0]    # class labels
    n_samples, n_features = X.shape
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
    print X_train.shape, X_test.shape
    # Instantiate the estimator
    clf = GaussianNB()
    # Fit the estimator to the data
    clf.fit(X_train, y_train)
    # Use the model to predict the last several labels
    y_pred = clf.predict(X_test)
    print "Gaussian Naive Bayes estimator accuracy "
    print "classification accuracy:", metrics.accuracy_score(y_test, y_pred)
    print "precision:", metrics.precision_score(y_test, y_pred)
    print "recall:", metrics.recall_score(y_test, y_pred)
    print "f1 score:", metrics.f1_score(y_test, y_pred)
    results = Output + "GaussianNB_metrics_test.txt"
    file = open(results, "w")  # shadows the `file` builtin (legacy style)
    file.write("Gaussian Naive Bayes estimator accuracy\n")
    file.write("Classification Accuracy Score: %f\n" % metrics.accuracy_score(y_test, y_pred))
    file.write("Precision Score: %f\n" % metrics.precision_score(y_test, y_pred))
    file.write("Recall Score: %f\n" % metrics.recall_score(y_test, y_pred))
    file.write("F1 Score: %f\n" % metrics.f1_score(y_test, y_pred))
    file.write("True Value, Predicted Value, Iteration\n")
    for n in xrange(len(y_test)):
        file.write("%f,%f,%i\n" % (y_test[n], y_pred[n], (n + 1)))
    file.close()
    title = "Gaussian Naive Bayes %f" % test_size
    save = Output + "Gaussian_NB_confusion_matrix" + "_%s.png" % test_size
    plot_confusion_matrix(y_test, y_pred, title, save)
    lvltrace.lvltrace("LVLsortie dans gaussianNB split_test")
def nearest_centroid(input_file,Output,test_size): ncol=tools.file_col_coma(input_file) data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1)) X = data[:,1:] y = data[:,0] n_samples, n_features = X.shape X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size) print X_train.shape, X_test.shape clf = NearestCentroid() clf.fit(X_train,y_train) y_pred = clf.predict(X_test) print "Nearest Centroid Classifier " print "classification accuracy:", metrics.accuracy_score(y_test, y_pred) print "precision:", metrics.precision_score(y_test, y_pred) print "recall:", metrics.recall_score(y_test, y_pred) print "f1 score:", metrics.f1_score(y_test, y_pred) print "\n" results = Output+"Nearest_Centroid_metrics_test.txt" file = open(results, "w") file.write("Nearest Centroid Classifier estimator accuracy\n") file.write("Classification Accuracy Score: %f\n"%metrics.accuracy_score(y_test, y_pred)) file.write("Precision Score: %f\n"%metrics.precision_score(y_test, y_pred)) file.write("Recall Score: %f\n"%metrics.recall_score(y_test, y_pred)) file.write("F1 Score: %f\n"%metrics.f1_score(y_test, y_pred)) file.write("\n") file.write("True Value, Predicted Value, Iteration\n") for n in xrange(len(y_test)): file.write("%f,%f,%i\n"%(y_test[n],y_pred[n],(n+1))) file.close() title = "Nearest Centroid %f"%test_size save = Output + "Nearest_Centroid_confusion_matrix"+"_%s.png"%test_size plot_confusion_matrix(y_test, y_pred,title,save) lvltrace.lvltrace("LVLSortie dans stochasticGD split_test")
def stochasticGD(input_file, Output):
    """Fit an SGDClassifier on the full dataset and report metrics.

    Trains a hinge-loss, L2-penalised SGD classifier on all rows of
    `input_file` (CSV; column 0 = class label, rest = features) and evaluates
    on the SAME data (resubstitution).  Prints metrics, writes them plus
    per-sample true/predicted pairs to "Stochastic_GD_metrics.txt" under
    `Output`, and saves a confusion-matrix PNG.
    """
    lvltrace.lvltrace("LVLEntree dans stochasticGD")
    ncol = tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol - 1))
    X = data[:, 1:]   # features
    y = data[:, 0]    # class labels
    n_samples, n_features = X.shape
    clf = SGDClassifier(loss="hinge", penalty="l2")
    clf.fit(X, y)
    y_pred = clf.predict(X)  # predictions on the training data itself
    print "#########################################################################################################\n"
    print "Stochastic Gradient Descent "
    print "classification accuracy:", metrics.accuracy_score(y, y_pred)
    print "precision:", metrics.precision_score(y, y_pred)
    print "recall:", metrics.recall_score(y, y_pred)
    print "f1 score:", metrics.f1_score(y, y_pred)
    print "\n"
    print "#########################################################################################################\n"
    results = Output + "Stochastic_GD_metrics.txt"
    file = open(results, "w")  # shadows the `file` builtin (legacy style)
    file.write("Stochastic Gradient Descent estimator accuracy\n")
    file.write("Classification Accuracy Score: %f\n" % metrics.accuracy_score(y, y_pred))
    file.write("Precision Score: %f\n" % metrics.precision_score(y, y_pred))
    file.write("Recall Score: %f\n" % metrics.recall_score(y, y_pred))
    file.write("F1 Score: %f\n" % metrics.f1_score(y, y_pred))
    file.write("\n")
    file.write("True Value, Predicted Value, Iteration\n")
    for n in xrange(len(y)):
        file.write("%f,%f,%i\n" % (y[n], y_pred[n], (n + 1)))
    file.close()
    title = "Stochastic Gradient Descent"
    save = Output + "Stochastic_GD_confusion_matrix.png"
    plot_confusion_matrix(y, y_pred, title, save)
    lvltrace.lvltrace("LVLSortie dans stochasticGD")
def extratreeclassifier(input_file, Output, test_size):
    """Fit an ExtraTreesClassifier with a train/test split and report.

    Splits `input_file` (CSV; column 0 = class label, rest = features) with
    `train_test_split`, fits on the train part, evaluates on the test part.
    Prints metrics, writes them plus per-sample true/predicted pairs to
    "_Extremely_Random_Forest_metrics_test.txt" under `Output`, and saves a
    confusion-matrix PNG suffixed with `test_size`.

    NOTE(review): shadows the earlier full-fit `extratreeclassifier`.
    """
    lvltrace.lvltrace("LVLEntree dans extratreeclassifier split_test")
    ncol = tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol - 1))
    X = data[:, 1:]   # features
    y = data[:, 0]    # class labels
    n_samples, n_features = X.shape
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
    print X_train.shape, X_test.shape
    clf = ExtraTreesClassifier(n_estimators=10)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print "Extremely Randomized Trees"
    print "classification accuracy:", metrics.accuracy_score(y_test, y_pred)
    print "precision:", metrics.precision_score(y_test, y_pred)
    print "recall:", metrics.recall_score(y_test, y_pred)
    print "f1 score:", metrics.f1_score(y_test, y_pred)
    print "\n"
    results = Output + "_Extremely_Random_Forest_metrics_test.txt"
    file = open(results, "w")  # shadows the `file` builtin (legacy style)
    file.write("Extremely Random Forest Classifier estimator accuracy\n")
    file.write("Classification Accuracy Score: %f\n" % metrics.accuracy_score(y_test, y_pred))
    file.write("Precision Score: %f\n" % metrics.precision_score(y_test, y_pred))
    file.write("Recall Score: %f\n" % metrics.recall_score(y_test, y_pred))
    file.write("F1 Score: %f\n" % metrics.f1_score(y_test, y_pred))
    file.write("\n")
    file.write("True Value, Predicted Value, Iteration\n")
    for n in xrange(len(y_test)):
        file.write("%f,%f,%i\n" % (y_test[n], y_pred[n], (n + 1)))
    file.close()
    title = "Extremely Randomized Trees %f" % test_size
    save = Output + "Extremely_Randomized_Trees_confusion_matrix" + "_%s.png" % test_size
    plot_confusion_matrix(y_test, y_pred, title, save)
    lvltrace.lvltrace("LVLSortie dans extratreeclassifier split_test")
def merger_labelled(Preprocessed_file, file_name, Corrected_Features): lvltrace.lvltrace("LVLEntree dans merger_labelled dabs data_preproc") # Merge all features files into one file_random = Corrected_Features + '/' + file_name ncol = tools.file_col_coma(file_random) data = np.loadtxt(file_random, delimiter=',', usecols=range(ncol - 1)) X = data[:, 1:] n_samples, n_features = X.shape fout = open(Preprocessed_file, "a") fout.write("Class,") for n in xrange(1, n_features + 1): fout.write("feature_%i," % n) fout.write("\n") # first file: first_file = Corrected_Features + file_name for line in open(first_file): fout.write(line) # now the rest: for root, dirs, files in os.walk(Corrected_Features): for i in files: if i != file_name: if not i.startswith('.'): f = open(Corrected_Features + i) f.next() #Skip the header for line in f: fout.write(line) f.close() fout.close() lvltrace.lvltrace("LVLSortie dans merger_labelled dans data_preproc")
def pca(input_file,Output): lvltrace.lvltrace("LVLEntree dans pca unsupervised") ncol=tools.file_col_coma(input_file) data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1)) X = data[:,1:] y = data[:,0] n_samples, n_features = X.shape # instantiate the model model = PCA(n_components=2) # fit the model: notice we don't pass the labels! model.fit(X) # transform the data to two dimensions X_PCA = model.transform(X) print "#########################################################################################################\n" print "PCA" print "shape of result:", X_PCA.shape print model.explained_variance_ratio_ print "#########################################################################################################\n" results = Output+"pca.txt" file = open(results, "w") file.write("PCA\n") file.write("shape of result: %f,%f\n"%(X_PCA.shape[0],X_PCA.shape[1])) file.write("Explained variance ratio: %f,%f\n"%(model.explained_variance_ratio_[0],model.explained_variance_ratio_[1])) file.close() # plot the results along with the labels fig, ax = plt.subplots() im = ax.scatter(X_PCA[:, 0], X_PCA[:, 1], c=y) fig.colorbar(im); save = Output + "pca.png" plt.savefig(save) lvltrace.lvltrace("LVLSortie dans pca unsupervised")
def merger_labelled(Preprocessed_file,file_name,Corrected_Features): lvltrace.lvltrace("LVLEntree dans merger_labelled dabs data_preproc") # Merge all features files into one file_random=Corrected_Features+'/'+file_name ncol=tools.file_col_coma(file_random) data = np.loadtxt(file_random, delimiter=',', usecols=range(ncol-1)) X = data[:,1:] n_samples, n_features = X.shape fout=open(Preprocessed_file,"a") fout.write("Class,") for n in xrange(1,n_features+1): fout.write("feature_%i,"%n) fout.write("\n") # first file: first_file=Corrected_Features+file_name for line in open(first_file): fout.write(line) # now the rest: for root, dirs, files in os.walk(Corrected_Features): for i in files: if i != file_name: if not i.startswith('.'): f=open(Corrected_Features+i) f.next() #Skip the header for line in f: fout.write(line) f.close() fout.close() lvltrace.lvltrace("LVLSortie dans merger_labelled dans data_preproc")
def kmeans(input_file, n_clusters, Output):
    """Run k-means clustering and compare clusters against the true labels.

    Loads `input_file` (CSV; column 0 = class label, rest = features), fits
    k-means++ with `n_clusters` clusters, prints and writes clustering
    scores (homogeneity, completeness, V-measure, ARI, AMI, silhouette) to
    "kmeans_scores.txt" under `Output`, then saves a scatter plot of the
    FIRST TWO FEATURE COLUMNS coloured by cluster, with centroids marked.
    """
    lvltrace.lvltrace("LVLEntree dans kmeans unsupervised")
    ncol = tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol - 1))
    X = data[:, 1:]   # features
    y = data[:, 0]    # true class labels (for external cluster metrics)
    sample_size, n_features = X.shape
    k_means = cluster.KMeans(init='k-means++', n_clusters=n_clusters, n_init=10)
    k_means.fit(X)
    reduced_data = k_means.transform(X)            # NOTE(review): unused
    values = k_means.cluster_centers_.squeeze()    # NOTE(review): unused
    labels = k_means.labels_
    k_means_cluster_centers = k_means.cluster_centers_
    print "#########################################################################################################\n"
    #print y
    #print labels
    print "K-MEANS\n"
    print('homogeneity_score: %f' % metrics.homogeneity_score(y, labels))
    print('completeness_score: %f' % metrics.completeness_score(y, labels))
    print('v_measure_score: %f' % metrics.v_measure_score(y, labels))
    print('adjusted_rand_score: %f' % metrics.adjusted_rand_score(y, labels))
    print('adjusted_mutual_info_score: %f' % metrics.adjusted_mutual_info_score(y, labels))
    # silhouette over the full dataset (sample_size == n_samples here).
    print('silhouette_score: %f' % metrics.silhouette_score(X, labels, metric='euclidean', sample_size=sample_size))
    print('\n')
    print "#########################################################################################################\n"
    results = Output + "kmeans_scores.txt"
    file = open(results, "w")  # shadows the `file` builtin (legacy style)
    file.write("K-Means Scores\n")
    file.write("Homogeneity Score: %f\n" % metrics.homogeneity_score(y, labels))
    file.write("Completeness Score: %f\n" % metrics.completeness_score(y, labels))
    file.write("V-Measure: %f\n" % metrics.v_measure_score(y, labels))
    file.write("The adjusted Rand index: %f\n" % metrics.adjusted_rand_score(y, labels))
    file.write("Adjusted Mutual Information: %f\n" % metrics.adjusted_mutual_info_score(y, labels))
    file.write("Silhouette Score: %f\n" % metrics.silhouette_score(X, labels, metric='euclidean', sample_size=sample_size))
    file.write("\n")
    file.write("True Value, Cluster numbers, Iteration\n")
    for n in xrange(len(y)):
        file.write("%f, %f, %i\n" % (y[n], labels[n], (n + 1)))
    file.close()
    import pylab as pl
    from itertools import cycle
    # plot the results along with the labels
    k_means_cluster_centers = k_means.cluster_centers_
    fig, ax = plt.subplots()
    # NOTE(review): plots raw features 0 and 1, not a reduced projection.
    im = ax.scatter(X[:, 0], X[:, 1], c=labels, marker='.')
    for k in xrange(n_clusters):
        my_members = labels == k
        cluster_center = k_means_cluster_centers[k]
        ax.plot(cluster_center[0], cluster_center[1], 'w',
                color='b', marker='x', markersize=6)
    fig.colorbar(im)
    plt.title("Number of clusters: %i" % n_clusters)
    save = Output + "kmeans.png"
    plt.savefig(save)
    lvltrace.lvltrace("LVLsortie dans kmeans unsupervised")
def forest_of_trees(input_file, Output):
    """Rank feature importances with an extra-trees forest.

    Loads `input_file` (CSV; column 0 = class label, rest = features), fits
    an ExtraTreesClassifier (250 trees, fixed random_state for
    reproducibility), writes the ranked importances to "forest_of_tree.txt"
    under `Output`, and saves a bar chart (with inter-tree std error bars)
    to "forest_of_tree.png".
    """
    import numpy as np
    from sklearn.datasets import make_classification
    from sklearn.ensemble import ExtraTreesClassifier
    lvltrace.lvltrace("LVLEntree dans forest_of_trees dans feature_selection")
    # Build a classification task using 3 informative features
    ncol = tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol - 1))
    X = data[:, 1:]
    y = data[:, 0]
    #print X
    #print y
    sample_size, n_features = X.shape
    # Build a forest and compute the feature importances
    forest = ExtraTreesClassifier(n_estimators=250, random_state=0)
    forest.fit(X, y)
    importances = forest.feature_importances_
    # Std of importances across the individual trees → error bars.
    std = np.std([tree.feature_importances_ for tree in forest.estimators_],
                 axis=0)
    indices = np.argsort(importances)[::-1]  # descending importance
    results = Output + "forest_of_tree.txt"
    file = open(results, "w")  # shadows the `file` builtin (legacy style)
    file.write("Feature Ranking\n")
    # Print the feature ranking (1-based feature numbers)
    #print("Feature ranking:")
    for f in range(n_features):
        file.write("%d. feature %d (%f)\n" %
                   (f + 1, indices[f] + 1, importances[indices[f]]))
        #print("%d. feature %d (%f)" % (f + 1, indices[f]+1, importances[indices[f]]))
    file.close()
    # Plot the feature importances of the forest
    import pylab as pl
    pl.figure()
    pl.title("Feature importances: Forest of trees applied to Layers + Types")
    pl.bar(range(n_features), importances[indices],
           color="r", yerr=std[indices], align="center")
    pl.xticks(range(n_features), indices + 1)
    pl.axis('tight')
    #pl.xlim([-1, 73])
    save = Output + "forest_of_tree.png"
    pl.savefig(save)
    lvltrace.lvltrace("LVLSortie dans forest_of_trees dans feature_selection")
def similarity_matrix(Corrected_Features, similarity):
    """Write a pairwise dissimilarity matrix between feature CSV files.

    Duplicate of an identically-named definition in this module — the later
    binding wins at import time.

    For every ordered pair of non-hidden files under `Corrected_Features`,
    writes the mean absolute difference between the files' per-feature
    column means as one CSV cell; one output row per outer file.

    NOTE(review): `f` is sized from the OUTER file's column count but
    indexed by the INNER file's — assumes equal widths; confirm.
    """
    file = open(similarity, "w")  # shadows the `file` builtin (legacy style)
    writer = csv.writer(file, lineterminator=',')  # NOTE(review): unused
    for root, dirs, files in os.walk(Corrected_Features):
        for j in files:
            if not j.startswith('.'):
                input_1 = Corrected_Features + j
                ncol1 = tools.file_col_coma(input_1) - 1
                data = np.loadtxt(input_1, delimiter=',', usecols=range(
                    ncol1))  # ncol-1 because we skip the neuron names
                X1 = data[:, :]
                y1 = data[:, 0].astype(np.int)  # Labels (class)
                mtype1 = [0 for x in xrange(ncol1 - 1)]
                f = [0 for x in xrange(ncol1 - 1)]
                for col in xrange(1, ncol1):
                    mtype1[col - 1] = np.mean(X1[:, col])  # mean of each feature
                for root, dirs, files in os.walk(Corrected_Features):
                    for i in files:
                        if not i.startswith('.'):
                            input_2 = Corrected_Features + i
                            ncol2 = tools.file_col_coma(input_2) - 1
                            data = np.loadtxt(
                                input_2, delimiter=',', usecols=range(ncol2)
                            )  # ncol-1 because we skip the neuron names
                            X2 = data[:, :]
                            y2 = data[:, 0].astype(np.int)  # Labels (class)
                            mtype2 = [0 for x in xrange(ncol2 - 1)]
                            for col in xrange(1, ncol2):
                                mtype2[col - 1] = np.mean(
                                    X2[:, col])  # mean of each feature
                            for col in xrange(1, ncol2):
                                f[col - 1] = np.abs(mtype2[col - 1] - mtype1[col - 1])
                            # NOTE(review): rebinds the `similarity` parameter.
                            similarity = np.mean(f)
                            file.write("%f," % similarity)
                file.write("\n")
    file.close()
def forest_of_trees(input_file, Output):
    """Rank feature importances with an extra-trees forest.

    Duplicate of an identically-named definition in this module — the later
    binding wins at import time.

    Fits an ExtraTreesClassifier (250 trees, fixed random_state), writes the
    ranked importances to "forest_of_tree.txt" under `Output`, and saves a
    bar chart with inter-tree std error bars to "forest_of_tree.png".
    """
    import numpy as np
    from sklearn.datasets import make_classification
    from sklearn.ensemble import ExtraTreesClassifier
    lvltrace.lvltrace("LVLEntree dans forest_of_trees dans feature_selection")
    # Build a classification task using 3 informative features
    ncol = tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol - 1))
    X = data[:, 1:]
    y = data[:, 0]
    #print X
    #print y
    sample_size, n_features = X.shape
    # Build a forest and compute the feature importances
    forest = ExtraTreesClassifier(n_estimators=250, random_state=0)
    forest.fit(X, y)
    importances = forest.feature_importances_
    # Std across individual trees → error bars on the plot.
    std = np.std([tree.feature_importances_ for tree in forest.estimators_],
                 axis=0)
    indices = np.argsort(importances)[::-1]  # descending importance
    results = Output + "forest_of_tree.txt"
    file = open(results, "w")  # shadows the `file` builtin (legacy style)
    file.write("Feature Ranking\n")
    # Print the feature ranking (1-based feature numbers)
    #print("Feature ranking:")
    for f in range(n_features):
        file.write("%d. feature %d (%f)\n" %
                   (f + 1, indices[f] + 1, importances[indices[f]]))
        #print("%d. feature %d (%f)" % (f + 1, indices[f]+1, importances[indices[f]]))
    file.close()
    # Plot the feature importances of the forest
    import pylab as pl
    pl.figure()
    pl.title("Feature importances: Forest of trees applied to Layers + Types")
    pl.bar(range(n_features), importances[indices],
           color="r", yerr=std[indices], align="center")
    pl.xticks(range(n_features), indices + 1)
    pl.axis('tight')
    #pl.xlim([-1, 73])
    save = Output + "forest_of_tree.png"
    pl.savefig(save)
    lvltrace.lvltrace("LVLSortie dans forest_of_trees dans feature_selection")
def gmm(input_file, Output):
    """Fit GMM and Dirichlet-process GMM mixtures and plot their components.

    Loads `input_file` (CSV; column 0 = class label, rest = features), fits
    a 5-component spherical GMM (EM) and a 5-component spherical DPGMM on
    the features, then draws both as stacked subplots — points coloured by
    predicted component, one covariance ellipse per used component — saved
    to "gmm.png" under `Output`.

    NOTE(review): uses the legacy `mixture.GMM` / `mixture.DPGMM` /
    `_get_covars()` API removed in modern scikit-learn — confirm the pinned
    sklearn version before touching this.
    """
    lvltrace.lvltrace("LVLEntree dans gmm unsupervised")
    print "#########################################################################################################\n"
    print "GMM"
    print "#########################################################################################################\n"
    ncol = tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol - 1))
    X = data[:, 1:]   # features
    y = data[:, 0]    # class labels (loaded but not used below)
    n_samples, n_features = X.shape
    # Fit a mixture of gaussians with EM using five components
    gmm = mixture.GMM(n_components=5, covariance_type='spherical',
                      init_params='wmc')
    gmm.fit(X)
    # Fit a dirichlet process mixture of gaussians using five components
    dpgmm = mixture.DPGMM(n_components=5, covariance_type='spherical',
                          init_params='wmc')
    dpgmm.fit(X)
    # Long colour cycle so many components can be drawn distinctly.
    color_iter = itertools.cycle(['r', 'g', 'b', 'c', 'm', 'b', 'g', 'r', 'c',
                                  'm', 'y', 'k', 'b', 'g', 'r', 'c', 'm', 'y',
                                  'k', 'b', 'g', 'r', 'c', 'm', 'y', 'k', 'b',
                                  'g', 'r', 'c', 'm', 'y', 'k'])
    for i, (clf, title) in enumerate([(gmm, 'GMM'),
                                      (dpgmm, 'Dirichlet Process GMM')]):
        splot = pl.subplot(2, 1, 1 + i)
        Y_ = clf.predict(X)
        # NOTE: the loop variable `i` is reused here, shadowing the subplot
        # index from the outer enumerate (harmless after this point).
        for i, (mean, covar, color) in enumerate(zip(
                clf.means_, clf._get_covars(), color_iter)):
            v, w = linalg.eigh(covar)
            u = w[0] / linalg.norm(w[0])
            # as the DP will not use every component it has access to
            # unless it needs it, we shouldn't plot the redundant
            # components.
            if not np.any(Y_ == i):
                continue
            pl.scatter(X[Y_ == i, 0], X[Y_ == i, 1], .8, color=color)
            # Plot an ellipse to show the Gaussian component
            angle = np.arctan(u[1] / u[0])
            angle = 180 * angle / np.pi  # convert to degrees
            ell = mpl.patches.Ellipse(mean, v[0], v[1], 180 + angle,
                                      color=color)
            ell.set_clip_box(splot.bbox)
            ell.set_alpha(0.5)
            splot.add_artist(ell)
        pl.xticks(())
        pl.yticks(())
        pl.title(title)
    save = Output + "gmm.png"
    plt.savefig(save)
    lvltrace.lvltrace("LVLSortie dans gmm unsupervised")
def multinomialNB(input_file, Output):
    """Fit Multinomial Naive Bayes on the full dataset and report metrics.

    Trains on all rows of `input_file` (CSV; column 0 = class label, rest =
    features) and evaluates on the SAME data (resubstitution).  Prints
    metrics, writes them plus per-sample true/predicted pairs to
    "Multinomial_NB_metrics.txt" under `Output`, and saves a
    confusion-matrix PNG.  MultinomialNB rejects negative inputs with a
    ValueError; when the pipeline is configured with
    normalization='normalize', that error is caught and explained in the
    metrics file instead of propagating.
    """
    lvltrace.lvltrace("LVLEntree dans multinomialNB")
    try:
        ncol = tools.file_col_coma(input_file)
        data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol - 1))
        X = data[:, 1:]   # features
        y = data[:, 0]    # class labels
        n_samples, n_features = X.shape
        # Instantiate the estimator
        clf = MultinomialNB()
        # Fit the estimator to the data
        clf.fit(X, y)
        # Use the model to predict the last several labels
        y_pred = clf.predict(X)
        print "#########################################################################################################\n"
        print "Multinomial Naive Bayes estimator accuracy "
        print "classification accuracy:", metrics.accuracy_score(y, y_pred)
        print "precision:", metrics.precision_score(y, y_pred)
        print "recall:", metrics.recall_score(y, y_pred)
        print "f1 score:", metrics.f1_score(y, y_pred)
        print "\n"
        print "#########################################################################################################\n"
        results = Output + "Multinomial_NB_metrics.txt"
        file = open(results, "w")  # shadows the `file` builtin (legacy style)
        file.write("Multinomial Naive Bayes estimator accuracy\n")
        file.write("Classification Accuracy Score: %f\n" % metrics.accuracy_score(y, y_pred))
        file.write("Precision Score: %f\n" % metrics.precision_score(y, y_pred))
        file.write("Recall Score: %f\n" % metrics.recall_score(y, y_pred))
        file.write("F1 Score: %f\n" % metrics.f1_score(y, y_pred))
        file.write("True Value, Predicted Value, Iteration\n")
        for n in xrange(len(y)):
            file.write("%f,%f,%i\n" % (y[n], y_pred[n], (n + 1)))
        file.close()
        title = "Multinomial Naive Bayes"
        save = Output + "Multinomial_NB_confusion_matrix.png"
        plot_confusion_matrix(y, y_pred, title, save)
    except (ValueError):
        # Negative feature values under 'normalize' scaling are invalid for
        # MultinomialNB; record the explanation instead of crashing.
        if configuration.normalization == 'normalize':
            results = Output + "Multinomial_NB_metrics.txt"
            file = open(results, "w")
            file.write("In configuration.py file, normalization=normalize -- Input Values must be superior to 0\n")
            file.close()
    lvltrace.lvltrace("LVLSortie dans multinomialNB")
def lda(input_file, Output):
    """Fit Linear Discriminant Analysis on the full dataset and report.

    Trains on all rows of `input_file` (CSV; column 0 = class label, rest =
    features) and evaluates on the SAME data (resubstitution).  Prints
    metrics, writes them plus per-sample true/predicted pairs to
    "LDA_metrics.txt" under `Output`, saves a confusion-matrix PNG, and a
    2-D scatter plot of the LDA projection ("LDA_plot.png").

    NOTE(review): a later identically-named split variant shadows this
    definition at import time.
    """
    lvltrace.lvltrace("LVLEntree dans lda")
    ncol = tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol - 1))
    X = data[:, 1:]   # features
    y = data[:, 0]    # class labels
    n_samples, n_features = X.shape
    #lda=LDA(n_components=2)
    lda = LDA()
    lda.fit(X, y)
    X_LDA = lda.transform(X)
    y_pred = lda.predict(X)  # predictions on the training data itself
    print "#########################################################################################################\n"
    print "Linear Discriminant Analysis Accuracy "
    print "classification accuracy:", metrics.accuracy_score(y, y_pred)
    print "precision:", metrics.precision_score(y, y_pred)
    print "recall:", metrics.recall_score(y, y_pred)
    print "f1 score:", metrics.f1_score(y, y_pred)
    print "\n"
    print "#########################################################################################################\n"
    results = Output + "LDA_metrics.txt"
    file = open(results, "w")  # shadows the `file` builtin (legacy style)
    file.write("Linear Discriminant Analaysis estimator accuracy\n")
    file.write("Classification Accuracy Score: %f\n" % metrics.accuracy_score(y, y_pred))
    file.write("Precision Score: %f\n" % metrics.precision_score(y, y_pred))
    file.write("Recall Score: %f\n" % metrics.recall_score(y, y_pred))
    file.write("F1 Score: %f\n" % metrics.f1_score(y, y_pred))
    file.write("\n")
    file.write("True Value, Predicted Value, Iteration\n")
    for n in xrange(len(y)):
        file.write("%f,%f,%i\n" % (y[n], y_pred[n], (n + 1)))
    file.close()
    title = "LDA"
    save = Output + "LDA_confusion_matrix.png"
    plot_confusion_matrix(y, y_pred, title, save)
    # plot the results along with the labels (first two LDA components;
    # assumes the projection has >= 2 columns — TODO confirm class count).
    fig, ax = plt.subplots()
    im = ax.scatter(X_LDA[:, 0], X_LDA[:, 1], c=y)
    fig.colorbar(im);
    save_lda = Output + "LDA_plot.png"
    plt.savefig(save_lda)
    plt.close()
    lvltrace.lvltrace("LVLSortie dans lda")
def lda(input_file,Output,test_size):
    # Linear Discriminant Analysis with a held-out test split.
    # NOTE(review): this definition overwrites the 2-argument `lda` defined
    # just above it — after module import only this 3-argument version exists.
    #
    # input_file: CSV, first column = class label, rest = features.
    # Output:     directory/prefix for outputs (filenames are suffixed with
    #             the test_size so several splits can coexist).
    # test_size:  fraction of samples held out by train_test_split.
    lvltrace.lvltrace("LVLEntree dans lda split_test")
    ncol=tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1))
    X = data[:,1:]
    y = data[:,0]
    n_samples, n_features = X.shape
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
    print X_train.shape, X_test.shape
    lda=LDA(n_components=2)
    lda.fit(X_train,y_train)
    X_LDA = lda.transform(X_train)   # 2-D projection of the TRAINING data (for the plot)
    print "shape of result:", X_LDA.shape
    # Metrics below are computed on the held-out test split.
    y_pred = lda.predict(X_test)
    print "Linear Discriminant Analysis Accuracy "
    print "classification accuracy:", metrics.accuracy_score(y_test, y_pred)
    print "precision:", metrics.precision_score(y_test, y_pred)
    print "recall:", metrics.recall_score(y_test, y_pred)
    print "f1 score:", metrics.f1_score(y_test, y_pred)
    #LVLprint "\n"
    results = Output+"LDA_metrics_test.txt"
    file = open(results, "w")
    file.write("Linear Discriminant Analaysis estimator accuracy\n")
    file.write("Classification Accuracy Score: %f\n"%metrics.accuracy_score(y_test, y_pred))
    file.write("Precision Score: %f\n"%metrics.precision_score(y_test, y_pred))
    file.write("Recall Score: %f\n"%metrics.recall_score(y_test, y_pred))
    file.write("F1 Score: %f\n"%metrics.f1_score(y_test, y_pred))
    file.write("\n")
    file.write("True Value, Predicted Value, Iteration\n")
    for n in xrange(len(y_test)):
        file.write("%f,%f,%i\n"%(y_test[n],y_pred[n],(n+1)))
    file.close()
    title = "LDA %f"%test_size
    save = Output + "LDA_confusion_matrix"+"_%s.png"%test_size
    plot_confusion_matrix(y_test, y_pred,title,save)
    # plot the results along with the labels
    fig, ax = plt.subplots()
    im = ax.scatter(X_LDA[:, 0], X_LDA[:, 1], c=y_train)
    fig.colorbar(im);
    save_lda = Output + "LDA_plot_test"+"_%s.png"%test_size
    plt.savefig(save_lda)
    plt.close()
    lvltrace.lvltrace("LVLSortie dans lda split_test")
def qda(input_file,Output): lvltrace.lvltrace("LVLEntree dans qda") try: ncol=tools.file_col_coma(input_file) data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1)) X = data[:,1:] y = data[:,0] n_samples, n_features = X.shape qda=QDA() qda.fit(X,y) y_pred = qda.predict(X) print "#########################################################################################################\n" print "Quadratic Discriminant Analysis Accuracy " print "classification accuracy:", metrics.accuracy_score(y, y_pred) print "precision:", metrics.precision_score(y, y_pred) print "recall:", metrics.recall_score(y, y_pred) print "f1 score:", metrics.f1_score(y, y_pred) print "\n" print "#########################################################################################################\n" results = Output+"QDA_metrics.txt" file = open(results, "w") file.write("Quadratic Discriminant Analaysis estimator accuracy\n") file.write("Classification Accuracy Score: %f\n"%metrics.accuracy_score(y, y_pred)) file.write("Precision Score: %f\n"%metrics.precision_score(y, y_pred)) file.write("Recall Score: %f\n"%metrics.recall_score(y, y_pred)) file.write("F1 Score: %f\n"%metrics.f1_score(y, y_pred)) file.write("\n") file.write("True Value, Predicted Value, Iteration\n") for n in xrange(len(y)): file.write("%f,%f,%i\n"%(y[n],y_pred[n],(n+1))) file.close() title = "QDA" save = Output + "QDA_confusion_matrix.png" plot_confusion_matrix(y, y_pred,title,save) except (AttributeError): if configuration.normalization == 'normalize': results = Output+"Multinomial_NB_metrics.txt" file = open(results, "w") file.write("In configuration.py file, normalization='normalize' -- Input Values must be superior to 0\n") file.close() lvltrace.lvltrace("LVLSortie dans qda")
def qda(input_file,Output,test_size): lvltrace.lvltrace("LVLEntree dans qda split_test") try: ncol=tools.file_col_coma(input_file) data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1)) X = data[:,1:] y = data[:,0] n_samples, n_features = X.shape X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size) print X_train.shape, X_test.shape lda=QDA() lda.fit(X_train,y_train) y_pred = lda.predict(X_test) print "Quadratic Discriminant Analysis Accuracy " print "classification accuracy:", metrics.accuracy_score(y_test, y_pred) print "precision:", metrics.precision_score(y_test, y_pred) print "recall:", metrics.recall_score(y_test, y_pred) print "f1 score:", metrics.f1_score(y_test, y_pred) #LVLprint "\n" results = Output+"QDA_metrics_test.txt" file = open(results, "w") file.write("Quadratic Discriminant Analaysis estimator accuracy\n") file.write("Classification Accuracy Score: %f\n"%metrics.accuracy_score(y_test, y_pred)) file.write("Precision Score: %f\n"%metrics.precision_score(y_test, y_pred)) file.write("Recall Score: %f\n"%metrics.recall_score(y_test, y_pred)) file.write("F1 Score: %f\n"%metrics.f1_score(y_test, y_pred)) file.write("\n") file.write("True Value, Predicted Value, Iteration\n") for n in xrange(len(y_test)): file.write("%f,%f,%i\n"%(y_test[n],y_pred[n],(n+1))) file.close() title = "QDA %f"%test_size save = Output + "QDA_confusion_matrix"+"_%s.png"%test_size plot_confusion_matrix(y_test, y_pred,title,save) except (AttributeError): if configuration.normalization == 'normalize': results = Output+"Multinomial_NB_metrics_test.txt" file = open(results, "w") file.write("In configuration.py file, normalization='normalize' -- Input Values must be superior to 0\n") file.close() lvltrace.lvltrace("LVLSortie dans qda split_test")
def Radius_Neighbors(input_file,Output,test_size): lvltrace.lvltrace("LVLEntree dans radius_kneighbors split_test") try: ncol=tools.file_col_coma(input_file) data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1)) X = data[:,1:] y = data[:,0] n_samples, n_features = X.shape X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size) print X_train.shape, X_test.shape clf = RadiusNeighborsClassifier(radius=0.001, weights='uniform', algorithm='auto') clf.fit(X_train, y_train) y_pred = clf.predict(X_test) print "Radius Neighbors accuracy " print "classification accuracy:", metrics.accuracy_score(y_test, y_pred) print "precision:", metrics.precision_score(y_test, y_pred) print "recall:", metrics.recall_score(y_test, y_pred) print "f1 score:", metrics.f1_score(y_test, y_pred) print "\n" results = Output+"Raidus_Neighbors_metrics_test.txt" file = open(results, "w") file.write("Radius Neighbors estimator accuracy\n") file.write("Classification Accuracy Score: %f\n"%metrics.accuracy_score(y_test, y_pred)) file.write("Precision Score: %f\n"%metrics.precision_score(y_test, y_pred)) file.write("Recall Score: %f\n"%metrics.recall_score(y_test, y_pred)) file.write("F1 Score: %f\n"%metrics.f1_score(y_test, y_pred)) file.write("\n") file.write("True Value, Predicted Value, Iteration\n") for n in xrange(len(y_test)): file.write("%f,%f,%i\n"%(y_test[n],y_pred[n],(n+1))) file.close() title = "Radius Neighbors %f"%test_size save = Output + "Radius_Neighbors_confusion_matrix"+"_%s.png"%test_size plot_confusion_matrix(y_test, y_pred,title,save) except (ValueError): results = Output+"Raidus_Neighbors_metrics_test.txt" file = open(results, "w") file.write("In configuration.py file: No neighbors found for test samples, you can try using larger radius, give a label for outliers, consider or removing them from your dataset.") file.close() lvltrace.lvltrace("LVLSortie dans radius_kneighbors split_test")
def meanshift(input_file,Output):
    # Unsupervised Mean Shift clustering; clustering quality is scored against
    # the true labels (column 0 of the CSV) and written to
    # <Output>mean_shift_metrics.txt, with a scatter plot of the clusters.
    #
    # input_file: CSV, first column = true class label, rest = features.
    # Output:     directory/prefix for the metrics file and plot.
    lvltrace.lvltrace("LVLEntree dans meanshift unsupervised")
    ncol=tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1))
    X = data[:,1:]
    y = data[:,0]
    sample_size, n_features = X.shape
    # Compute clustering with MeanShift
    # The following bandwidth can be automatically detected using
    # NOTE(review): `bandwidth` is computed but never passed to MeanShift(),
    # which therefore estimates its own bandwidth internally — confirm whether
    # MeanShift(bandwidth=bandwidth) was intended.
    bandwidth = estimate_bandwidth(X, quantile=0.2, n_samples=sample_size)
    ms = MeanShift()
    ms.fit(X)
    labels = ms.labels_
    cluster_centers = ms.cluster_centers_
    labels_unique = np.unique(labels)
    n_clusters_ = len(labels_unique)
    print "#########################################################################################################\n"
    print "Mean Shift"
    print("number of estimated clusters : %d" % n_clusters_)
    #print labels
    #print y
    print('homogeneity_score: %f'%metrics.homogeneity_score(y, labels))
    print('completeness_score: %f'%metrics.completeness_score(y, labels))
    print('v_measure_score: %f'%metrics.v_measure_score(y, labels))
    print('adjusted_rand_score: %f'%metrics.adjusted_rand_score(y, labels))
    print('adjusted_mutual_info_score: %f'%metrics.adjusted_mutual_info_score(y, labels))
    try:
        print('silhouette_score: %f'%metrics.silhouette_score(X, labels, metric='euclidean', sample_size=sample_size))
    except (ValueError):
        # silhouette_score needs 2 <= n_labels <= n_samples - 1.
        print "ValueError: Number of labels is 1 but should be more than 2and less than n_samples - 1"
    print "\n"
    print "#########################################################################################################\n"
    results = Output+"mean_shift_metrics.txt"
    file = open(results, "w")
    file.write("Mean Shift\n")
    file.write("Homogeneity Score: %f\n"%metrics.homogeneity_score(y, labels))
    file.write("Completeness Score: %f\n"%metrics.completeness_score(y, labels))
    file.write("V-Measure: %f\n"%metrics.v_measure_score(y, labels))
    file.write("The adjusted Rand index: %f\n"%metrics.adjusted_rand_score(y, labels))
    file.write("Adjusted Mutual Information: %f\n"%metrics.adjusted_mutual_info_score(y, labels))
    try:
        file.write("Silhouette Score: %f\n"%metrics.silhouette_score(X, labels, metric='euclidean', sample_size=sample_size))
    except (ValueError):
        file.write("ValueError: Number of labels is 1 but should be more than 2and less than n_samples - 1")
    file.write("\n")
    file.write("True Value, Clusters, Iteration\n")
    for n in xrange(len(y)):
        file.write("%f,%f,%i\n"%(y[n],labels[n],(n+1)))
    file.close()
    # Plot result
    # NOTE(review): pylab/cycle imports below appear unused in this function.
    import pylab as pl
    from itertools import cycle
    fig, ax = plt.subplots()
    # Scatter of the first two raw features, colored by assigned cluster.
    im=ax.scatter(X[:, 0], X[:, 1], c=labels, marker='.')
    for k in xrange(n_clusters_):
        my_members = labels == k
        cluster_center = cluster_centers[k]
        #print cluster_center[0], cluster_center[1]
        ax.plot(cluster_center[0], cluster_center[1], 'w', color='b', marker='x', markersize=6)
    fig.colorbar(im);
    plt.title('Estimated number of clusters: %d' % n_clusters_)
    save = Output + "mean_shift.png"
    plt.savefig(save)
    lvltrace.lvltrace("LVLSortie dans meanshift unsupervised")
def univariate(input_file, Output, percentile):
    ###############################################################################
    # import some data to play with
    # Univariate feature selection (ANOVA F-test via SelectPercentile) compared
    # against linear-SVM weights, before and after selection.  Produces a bar
    # chart (<Output>univariate.png) and a ranking file (<Output>univariate.txt)
    # of (1-based feature index, normalized post-selection SVM weight).
    #
    # input_file: CSV, first column = class label, rest = features.
    # Output:     directory/prefix for the plot and ranking file.
    # percentile: percentage of top-scoring features kept by SelectPercentile.
    #
    # NOTE(review): an identical `univariate` definition appears again below
    # and rebinds this name at import time.
    lvltrace.lvltrace("LVLEntree dans univariate dans feature_selection")
    ncol = tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol - 1))
    X = data[:, 1:]
    y = data[:, 0]
    sample_size, n_features = X.shape
    ###############################################################################
    pl.figure(1)
    pl.clf()
    X_indices = np.arange(X.shape[-1])
    #print X_indices
    ###############################################################################
    # Univariate feature selection with F-test for feature scoring
    # We use the default selection function: the 10% most significant features
    selector = SelectPercentile(f_classif, percentile=percentile)
    selector.fit(X, y)
    # Transform p-values into normalized scores in [0, 1].
    scores = -np.log10(selector.pvalues_)
    scores /= scores.max()
    pl.bar(X_indices - .45, scores, width=.2, label=r'Univariate score ($-Log(p_{value})$)', color='g')
    ###############################################################################
    # Compare to the weights of an SVM
    clf = svm.SVC(kernel='linear')
    clf.fit(X, y)
    svm_weights = (clf.coef_**2).sum(axis=0)
    svm_weights /= svm_weights.max()
    pl.bar((X_indices + 1) - .25, svm_weights, width=.2, label='SVM weight', color='r')
    # Re-fit the SVM on the selected feature subset only.
    clf_selected = svm.SVC(kernel='linear')
    clf_selected.fit(selector.transform(X), y)
    svm_weights_selected = (clf_selected.coef_**2).sum(axis=0)
    svm_weights_selected /= svm_weights_selected.max()
    pl.bar(X_indices[selector.get_support()] - .05, svm_weights_selected, width=.2, label='SVM weights after selection', color='b')
    pl.title("Feature selection")
    pl.xlabel('Feature number')
    pl.yticks(())
    pl.axis('tight')
    pl.legend(loc='upper right')
    save = Output + "univariate.png"
    pl.savefig(save)
    # Print the feature ranking
    results = Output + "univariate.txt"
    file = open(results, "w")
    file.write("Feature Ranking\n")
    #print len(X_indices[selector.get_support()])
    for i in xrange(len(X_indices[selector.get_support()])):
        #print i
        #print (X_indices[selector.get_support()][i]+1)
        #print svm_weights_selected[i]
        file.write("%f,%f\n" % ((X_indices[selector.get_support()][i] + 1), svm_weights_selected[i]))
    file.close()
    #print("Feature ranking:")
    #print (X_indices[selector.get_support()] +1)
    #print svm_weights_selected
    lvltrace.lvltrace("LVLSortie dans univariate dans feature_selection")
def univariate(input_file, Output, percentile):
    ###############################################################################
    # import some data to play with
    # NOTE(review): this is a byte-for-byte behavioral duplicate of the
    # `univariate` defined immediately above; it rebinds the name, so this is
    # the definition actually in effect.  Consider deleting one copy.
    #
    # Univariate feature selection (ANOVA F-test via SelectPercentile) compared
    # against linear-SVM weights, before and after selection.  Produces a bar
    # chart and a (feature index, weight) ranking file under Output.
    lvltrace.lvltrace("LVLEntree dans univariate dans feature_selection")
    ncol=tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1))
    X = data[:,1:]
    y = data[:,0]
    sample_size, n_features = X.shape
    ###############################################################################
    pl.figure(1)
    pl.clf()
    X_indices = np.arange(X.shape[-1])
    #print X_indices
    ###############################################################################
    # Univariate feature selection with F-test for feature scoring
    # We use the default selection function: the 10% most significant features
    selector = SelectPercentile(f_classif, percentile=percentile)
    selector.fit(X, y)
    scores = -np.log10(selector.pvalues_)
    scores /= scores.max()
    pl.bar(X_indices - .45, scores, width=.2, label=r'Univariate score ($-Log(p_{value})$)', color='g')
    ###############################################################################
    # Compare to the weights of an SVM
    clf = svm.SVC(kernel='linear')
    clf.fit(X, y)
    svm_weights = (clf.coef_ ** 2).sum(axis=0)
    svm_weights /= svm_weights.max()
    pl.bar((X_indices+1) - .25, svm_weights, width=.2, label='SVM weight', color='r')
    clf_selected = svm.SVC(kernel='linear')
    clf_selected.fit(selector.transform(X), y)
    svm_weights_selected = (clf_selected.coef_ ** 2).sum(axis=0)
    svm_weights_selected /= svm_weights_selected.max()
    pl.bar(X_indices[selector.get_support()] - .05, svm_weights_selected, width=.2, label='SVM weights after selection', color='b')
    pl.title("Feature selection")
    pl.xlabel('Feature number')
    pl.yticks(())
    pl.axis('tight')
    pl.legend(loc='upper right')
    save=Output+"univariate.png"
    pl.savefig(save)
    # Print the feature ranking
    results = Output+"univariate.txt"
    file = open(results, "w")
    file.write("Feature Ranking\n")
    #print len(X_indices[selector.get_support()])
    for i in xrange(len(X_indices[selector.get_support()])):
        #print i
        #print (X_indices[selector.get_support()][i]+1)
        #print svm_weights_selected[i]
        file.write("%f,%f\n"%((X_indices[selector.get_support()][i]+1),svm_weights_selected[i]))
    file.close()
    #print("Feature ranking:")
    #print (X_indices[selector.get_support()] +1)
    #print svm_weights_selected
    lvltrace.lvltrace("LVLSortie dans univariate dans feature_selection")
def KMeans_PCA(input_file, n_clusters, Output):
    # K-Means clustering on 2-D PCA-reduced data, scored against the true
    # labels, with a Voronoi-style decision-boundary plot of the reduced space.
    #
    # input_file: CSV, first column = true class label, rest = features.
    # n_clusters: number of K-Means clusters.
    # Output:     directory/prefix for the metrics file and plot.
    lvltrace.lvltrace("LVLEntree dans KMeans_PCA unsupervised")
    ncol=tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1))
    X = data[:,1:]
    y = data[:,0]
    sample_size, n_features = X.shape
    # Project to 2 principal components, then cluster the projection.
    reduced_data = PCA(n_components=2).fit_transform(X)
    k_means = KMeans(init='k-means++', n_clusters=n_clusters, n_init=50)
    k_means.fit(reduced_data)
    labels = k_means.labels_
    print "#########################################################################################################\n"
    print "K-MEANS on PCA-reduced data"
    #print labels
    #print y
    print('homogeneity_score: %f'%metrics.homogeneity_score(y, labels))
    print('completeness_score: %f'%metrics.completeness_score(y, labels))
    print('v_measure_score: %f'%metrics.v_measure_score(y, labels))
    print('adjusted_rand_score: %f'%metrics.adjusted_rand_score(y, labels))
    print('adjusted_mutual_info_score: %f'%metrics.adjusted_mutual_info_score(y, labels))
    # Silhouette is computed on the ORIGINAL feature space X, not the projection.
    print('silhouette_score: %f'%metrics.silhouette_score(X, labels, metric='euclidean', sample_size=sample_size))
    print "\n"
    print "#########################################################################################################\n"
    results = Output+"kmeans_PCA_metrics.txt"
    file = open(results, "w")
    file.write("K-Means clustering on the PCA-reduced data\n")
    file.write("Homogeneity Score: %f\n"%metrics.homogeneity_score(y, labels))
    file.write("Completeness Score: %f\n"%metrics.completeness_score(y, labels))
    file.write("V-Measure: %f\n"%metrics.v_measure_score(y, labels))
    file.write("The adjusted Rand index: %f\n"%metrics.adjusted_rand_score(y, labels))
    file.write("Adjusted Mutual Information: %f\n"%metrics.adjusted_mutual_info_score(y, labels))
    file.write("Silhouette Score: %f\n"%metrics.silhouette_score(X, labels, metric='euclidean', sample_size=sample_size))
    file.write("\n")
    file.write("True Value, Clusters, Iteration\n")
    for n in xrange(len(y)):
        file.write("%f,%f,%i\n"%(y[n],labels[n],(n+1)))
    file.close()
    # Step size of the mesh. Decrease to increase the quality of the VQ.
    h = .02     # point in the mesh [x_min, m_max]x[y_min, y_max]
    # Plot the decision boundary. For that, we will assign a color to each
    x_min, x_max = reduced_data[:, 0].min() , reduced_data[:, 0].max()
    y_min, y_max = reduced_data[:, 1].min() , reduced_data[:, 1].max()
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    # Obtain labels for each point in mesh. Use last trained model.
    Z = k_means.predict(np.c_[xx.ravel(), yy.ravel()])
    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    pl.figure(1)
    pl.clf()
    pl.imshow(Z, interpolation='nearest', extent=(xx.min(), xx.max(), yy.min(), yy.max()), cmap=pl.cm.Paired, aspect='auto', origin='lower')
    pl.plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=2)
    # Plot the centroids as a white X
    centroids = k_means.cluster_centers_
    pl.scatter(centroids[:, 0], centroids[:, 1], marker='x', s=169, linewidths=3, color='w', zorder=10)
    pl.title('K-means clustering on the PCA-reduced data\n' 'Number of clusters: %i'%n_clusters)
    pl.xlim(x_min, x_max)
    pl.ylim(y_min, y_max)
    pl.xticks(())
    pl.yticks(())
    save = Output + "kmeans_PCA.png"
    pl.savefig(save)
    lvltrace.lvltrace("LVLSortie dans KMeans_PCA unsupervised")
def affinitypropagation(input_file,type,pref,Output): lvltrace.lvltrace("LVLEntree dans affinitypropagation unsupervised") ncol=tools.file_col_coma(input_file) data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1)) X = data[:,1:] #print (" ici X vaut ") #print X #print (" fin de print X") labels_true = data[:,0] # A tester if type == 'spearmanr': X = scipy.stats.stats.spearmanr(X,axis=1)[0] else: if type == 'euclidean': X = -euclidean_distances(X, squared=True) else: print "something wrong" if pref == 'median': # A tester entre min ou median preference = np.median(X) else: if pref == 'mean': preference = np.mean(X) else: if pref == 'min': preference = np.min(X) else: print "something wrong" print "#########################################################################################################\n" print "Affinity Propagation" print preference n_samples, n_features = X.shape cluster_centers_indices, labels = affinity_propagation(X, preference=preference) #print cluster_centers_indices n_clusters_ = len(cluster_centers_indices) n_clusters_ = len(cluster_centers_indices) #print labels_true #print labels print('Estimated number of clusters: %d' % n_clusters_) print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels)) print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels)) print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels)) print("Adjusted Rand Index: %0.3f" % metrics.adjusted_rand_score(labels_true, labels)) print("Adjusted Mutual Information: %0.3f" % metrics.adjusted_mutual_info_score(labels_true, labels)) print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(X, labels, metric='sqeuclidean')) print "\n" print "#########################################################################################################\n" results = Output+"affinity_propagation.txt" file = open(results, "w") file.write("Affinity Propagation\n") file.write("Homogeneity Score: 
%f\n"%metrics.homogeneity_score(labels_true, labels)) file.write("Completeness Score: %f\n"%metrics.completeness_score(labels_true, labels)) file.write("V-Measure: %f\n"%metrics.v_measure_score(labels_true, labels)) file.write("The adjusted Rand index: %f\n"%metrics.adjusted_rand_score(labels_true, labels)) file.write("Adjusted Mutual Information: %f\n"%metrics.adjusted_mutual_info_score(labels_true, labels)) file.write("Silhouette Score: %f\n"%metrics.silhouette_score(X, labels, metric='sqeuclidean')) file.write("\n") file.write("True Value, Clusters, Iteration\n") for n in xrange(len(labels_true)): file.write("%f,%f,%i\n"%(labels_true[n],labels[n],(n+1))) file.close() # Plot result import pylab as pl from itertools import cycle pl.close('all') pl.figure(1) pl.clf() colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmykbgrcmykbgrcmykbg') for k, col in zip(range(n_clusters_), colors): class_members = labels == k cluster_center = X[cluster_centers_indices[k]] pl.plot(X[class_members, 0], X[class_members, 1], col + '.') pl.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col, markeredgecolor='k', markersize=14) for x in X[class_members]: pl.plot([cluster_center[0], x[0]], [cluster_center[1], x[1]], col) pl.title('Estimated number of clusters: %d' % n_clusters_) save = Output + "affinity_propagation.png" plt.savefig(save) lvltrace.lvltrace("LVLSortie dans affinitypropagation unsupervised")
def dbscan(input_file, Output): lvltrace.lvltrace("LVLEntree dans dbscan unsupervised") # Generate sample data ncol=tools.file_col_coma(input_file) data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1)) X = data[:,1:] labels_true = data[:,0] #X = StandardScaler().fit_transform(Y) # Compute DBSCAN db = DBSCAN().fit(X) core_samples = db.core_sample_indices_ labels = db.labels_ print "#########################################################################################################\n" print "DBSCAN" print labels_true print labels # Number of clusters in labels, ignoring noise if present. n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0) print('Estimated number of clusters: %d' % n_clusters_) print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels)) print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels)) print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels)) print("Adjusted Rand Index: %0.3f" % metrics.adjusted_rand_score(labels_true, labels)) print("Adjusted Mutual Information: %0.3f" % metrics.adjusted_mutual_info_score(labels_true, labels)) print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(X, labels)) print "\n" print "#########################################################################################################\n" results = Output+"dbscan.txt" file = open(results, "w") file.write("DBSCAN\n") file.write("Homogeneity Score: %f\n"%metrics.homogeneity_score(y, labels)) file.write("Completeness Score: %f\n"%metrics.completeness_score(y, labels)) file.write("V-Measure: %f\n"%metrics.v_measure_score(y, labels)) file.write("The adjusted Rand index: %f\n"%metrics.adjusted_rand_score(y, labels)) file.write("Adjusted Mutual Information: %f\n"%metrics.adjusted_mutual_info_score(y, labels)) file.write("Silhouette Score: %f\n"%metrics.silhouette_score(X, labels, metric='euclidean', sample_size=sample_size)) file.write("\n") file.write("True Value, Clusters, 
Iteration\n") for n in xrange(len(y)): file.write("%f,%f,%i\n"%(y[n],labels[n],(n+1))) file.close() # Plot result import pylab as pl # Black removed and is used for noise instead. unique_labels = set(labels) colors = pl.cm.Spectral(np.linspace(0, 1, len(unique_labels))) for k, col in zip(unique_labels, colors): if k == -1: # Black used for noise. col = 'k' markersize = 6 class_members = [index[0] for index in np.argwhere(labels == k)] cluster_core_samples = [index for index in core_samples if labels[index] == k] for index in class_members: x = X[index] if index in core_samples and k != -1: markersize = 14 else: markersize = 6 pl.plot(x[0], x[1], 'o', markerfacecolor=col, markeredgecolor='k', markersize=markersize) pl.title('Estimated number of clusters: %d' % n_clusters_) save = Output + "dbscan.png" plt.savefig(save) lvltrace.lvltrace("LVLSortie dans dbscan unsupervised")