def similarity_matrix(Corrected_Features,similarity):
    file = open(similarity, "w")
    writer=csv.writer(file, lineterminator=',')
    for root, dirs, files in os.walk(Corrected_Features):
        for j in files:
            if not j.startswith('.'):
                input_1=Corrected_Features+j
                ncol1=tools.file_col_coma(input_1)-1
                data = np.loadtxt(input_1, delimiter=',', usecols=range(ncol1)) # ncol-1 because we skip the neuron names
                X1 = data[:,:]
                y1 = data[:, 0].astype(np.int) # Labels (class)
                mtype1=[0 for x in xrange(ncol1-1)]
                f=[0 for x in xrange(ncol1-1)]
                for col in xrange(1,ncol1):
                    mtype1[col-1]=np.mean(X1[:,col]) #mean of each feature
                for root2, dirs2, files2 in os.walk(Corrected_Features): # distinct names so the outer walk's variables are not shadowed
                    for i in files2:
                        if not i.startswith('.'):
                            input_2=Corrected_Features+i
                            ncol2=tools.file_col_coma(input_2)-1
                            data = np.loadtxt(input_2, delimiter=',', usecols=range(ncol2)) # ncol-1 because we skip the neuron names
                            X2 = data[:,:]
                            y2 = data[:, 0].astype(np.int) # Labels (class)
                            mtype2=[0 for x in xrange(ncol2-1)]
                            for col in xrange(1,ncol2):
                                mtype2[col-1]=np.mean(X2[:,col]) #mean of each feature
                            for col in xrange(1,ncol2):
                                f[col-1]=np.abs(mtype2[col-1]-mtype1[col-1])
                            sim = np.mean(f) # mean absolute difference of per-feature means; do not overwrite the `similarity` output path
                            file.write("%f," % sim)
                file.write("\n")
    file.close()
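# A minimal vectorized sketch of the same pairwise computation as above, for
# comparison. Assumptions (not guaranteed by the original code): every feature
# file shares one column layout -- class label in column 0, features next,
# neuron names in the last column -- and all files fit in memory.
def similarity_matrix_vectorized(features_dir, out_csv):
    import os
    import numpy as np
    paths = sorted(os.path.join(features_dir, f)
                   for f in os.listdir(features_dir) if not f.startswith('.'))
    mean_rows = []
    for p in paths:
        ncol = len(open(p).readline().split(','))
        # Drop the trailing neuron-name column, then average each feature.
        data = np.loadtxt(p, delimiter=',', usecols=range(ncol - 1))
        mean_rows.append(data[:, 1:].mean(axis=0))
    M = np.array(mean_rows)
    # Entry (i, j) is |mean_i - mean_j| averaged over the features.
    S = np.abs(M[:, None, :] - M[None, :, :]).mean(axis=2)
    np.savetxt(out_csv, S, delimiter=',', fmt='%f')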
def neuron_similarity_matrix_labelled(Preprocessed_file, similarity,
                                      Corrected_Features):
    lvltrace.lvltrace(
        "LVLEntree dans neuron_similarity_matrix_labelled data_preproc")
    file = open(similarity, "w")
    writer = csv.writer(file, lineterminator=',')
    file.write("mtype,")
    for root, dirs, files in os.walk(Corrected_Features):
        for v in files:
            if not v.startswith('.'):
                input = Corrected_Features + v
                ncol = tools.file_col_coma(input) - 1
                data = np.loadtxt(input, delimiter=',', usecols=range(
                    ncol))  # ncol-1 because we skip the neuron names
                y = data[:, 0].astype(np.int)  # Labels (class)
                file.write("%i," % np.mean(y))
        file.write("\n")
    for root, dirs, files in os.walk(Corrected_Features):
        for j in files:
            if not j.startswith('.'):
                input_1 = Corrected_Features + j
                ncol1 = tools.file_col_coma(input_1) - 1
                data = np.loadtxt(input_1, delimiter=',', usecols=range(
                    ncol1))  # ncol-1 because we skip the neuron names
                X1 = data[:, :]
                y1 = data[:, 0].astype(np.int)  # Labels (class)
                mtype1 = [0 for x in xrange(ncol1 - 1)]
                f = [0 for x in xrange(ncol1 - 1)]
                label1 = np.mean(y1)
                for col in xrange(1, ncol1):
                    mtype1[col - 1] = np.mean(X1[:,
                                                 col])  #mean of each feature
                file.write("%i," % label1)
                for root2, dirs2, files2 in os.walk(Corrected_Features):  # distinct names so the outer walk's variables are not shadowed
                    for i in files2:
                        if not i.startswith('.'):
                            input_2 = Corrected_Features + i
                            ncol2 = tools.file_col_coma(input_2) - 1
                            data = np.loadtxt(
                                input_2, delimiter=',', usecols=range(ncol2)
                            )  # ncol-1 because we skip the neuron names
                            X2 = data[:, :]
                            y2 = data[:, 0].astype(np.int)  # Labels (class)
                            mtype2 = [0 for x in xrange(ncol2 - 1)]

                            for col in xrange(1, ncol2):
                                mtype2[col - 1] = np.mean(
                                    X2[:, col])  #mean of each feature

                            for col in xrange(1, ncol2):
                                f[col - 1] = np.abs(mtype2[col - 1] -
                                                    mtype1[col - 1])
                            sim = np.mean(f)  # mean absolute difference of per-feature means; keep the `similarity` output path intact
                            file.write("%f," % sim)
                file.write("\n")
    file.close()
    lvltrace.lvltrace(
        "LVLSortie dans neuron_similarity_matrix_labelled data_preproc")
def randomforest(input_file,Output):
    lvltrace.lvltrace("LVLEntree dans randomforest")
    ncol=tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1))
    X = data[:,1:]
    y = data[:,0]
    n_samples, n_features = X.shape
    clf = RandomForestClassifier(n_estimators=10)
    clf.fit(X,y)
    y_pred = clf.predict(X)
    print "#########################################################################################################\n"
    print "The Random forest algo "
    print "classification accuracy:", metrics.accuracy_score(y, y_pred)
    print "precision:", metrics.precision_score(y, y_pred)
    print "recall:", metrics.recall_score(y, y_pred)
    print "f1 score:", metrics.f1_score(y, y_pred)
    print "\n"
    print "#########################################################################################################\n"
    results = Output+"Random_Forest_metrics.txt"
    file = open(results, "w")
    file.write("Random Forest Classifier estimator accuracy\n")
    file.write("Classification Accuracy Score: %f\n"%metrics.accuracy_score(y, y_pred))
    file.write("Precision Score: %f\n"%metrics.precision_score(y, y_pred))
    file.write("Recall Score: %f\n"%metrics.recall_score(y, y_pred))
    file.write("F1 Score: %f\n"%metrics.f1_score(y, y_pred))
    file.write("\n")
    file.write("True Value, Predicted Value, Iteration\n")
    for n in xrange(len(y)):
        file.write("%f,%f,%i\n"%(y[n],y_pred[n],(n+1)))
    file.close()
    title = "The Random forest"
    save = Output + "Random_Forest_confusion_matrix.png"
    plot_confusion_matrix(y, y_pred,title,save)
    lvltrace.lvltrace("LVLSortie dans randomforest")
def extratreeclassifier(input_file,Output):
    lvltrace.lvltrace("LVLEntree dans extratreeclassifier")
    ncol=tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1))
    X = data[:,1:]
    y = data[:,0]
    n_samples, n_features = X.shape
    clf = ExtraTreesClassifier(n_estimators=10)
    clf.fit(X,y)
    y_pred = clf.predict(X)
    print "#########################################################################################################\n"
    print " Extremely Randomized Trees "
    print "classification accuracy:", metrics.accuracy_score(y, y_pred)
    print "precision:", metrics.precision_score(y, y_pred)
    print "recall:", metrics.recall_score(y, y_pred)
    print "\n"
    print "#########################################################################################################\n"
    results = Output+"_Extremely_Random_Forest_metrics.txt"
    file = open(results, "w")
    file.write("Extremely Random Forest Classifier estimator accuracy\n")
    file.write("Classification Accuracy Score: %f\n"%metrics.accuracy_score(y, y_pred))
    file.write("Precision Score: %f\n"%metrics.precision_score(y, y_pred))
    file.write("Recall Score: %f\n"%metrics.recall_score(y, y_pred))
    file.write("F1 Score: %f\n"%metrics.f1_score(y, y_pred))
    file.write("\n")
    file.write("True Value, Predicted Value, Iteration\n")
    for n in xrange(len(y)):
        file.write("%f,%f,%i\n"%(y[n],y_pred[n],(n+1)))
    file.close()
    title = "Extremely Randomized Trees"
    save = Output + "Extremely_Randomized_Trees_confusion_matrix.png"
    plot_confusion_matrix(y, y_pred,title,save)
    lvltrace.lvltrace("LVLSortie dans extratreeclassifier")
def SVC_linear(input_file,Output):
    lvltrace.lvltrace("LVLEntree dans SVC_linear")
    ncol=tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1))
    X = data[:,1:]
    y = data[:,0]
    n_samples, n_features = X.shape
    clf=svm.SVC(kernel='linear')
    clf.fit(X,y)
    y_pred = clf.predict(X)
    print "#########################################################################################################\n"
    print "C-Support Vector Classifcation (with linear kernel) "
    print "classification accuracy:", metrics.accuracy_score(y, y_pred)
    print "precision:", metrics.precision_score(y, y_pred)
    print "recall:", metrics.recall_score(y, y_pred)
    print "f1 score:", metrics.f1_score(y, y_pred)
    print "\n"
    print "#########################################################################################################\n"
    results = Output+"SVM_Linear_Kernel_metrics.txt"
    file = open(results, "w")
    file.write("Support Vector Machine with Linear Kernel estimator accuracy\n")
    file.write("Classification Accuracy Score: %f\n"%metrics.accuracy_score(y, y_pred))
    file.write("Precision Score: %f\n"%metrics.precision_score(y, y_pred))
    file.write("Recall Score: %f\n"%metrics.recall_score(y, y_pred))
    file.write("F1 Score: %f\n"%metrics.f1_score(y, y_pred))
    file.write("\n")
    file.write("True Value, Predicted Value, Iteration\n")
    for n in xrange(len(y)):
        file.write("%f,%f,%i\n"%(y[n],y_pred[n],(n+1)))
    file.close()
    title = "SVC - linear Kernel"
    save = Output + "SVC_linear_confusion_matrix.png"
    plot_confusion_matrix(y, y_pred,title,save)
    lvltrace.lvltrace("LVLSortie dans SVC_linear")
def SVC_linear(input_file,Output,test_size):
    lvltrace.lvltrace("LVLEntree dans SVC_linear split_test")
    ncol=tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1))
    X = data[:,1:]
    y = data[:,0]
    n_samples, n_features = X.shape
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
    print X_train.shape, X_test.shape
    clf=svm.SVC(kernel='linear')
    clf.fit(X_train,y_train)
    y_pred = clf.predict(X_test)
    print "C-Support Vector Classifcation (with RBF linear) "
    print "y_test, y_pred, iteration"
    print "classification accuracy:", metrics.accuracy_score(y_test, y_pred)
    print "precision:", metrics.precision_score(y_test, y_pred)
    print "recall:", metrics.recall_score(y_test, y_pred)
    print "f1 score:", metrics.f1_score(y_test, y_pred)
    print "\n"
    results = Output+"SVM_Linear_Kernel_metrics_test.txt"
    file = open(results, "w")
    file.write("Support Vector Machine with Linear Kernel estimator accuracy\n")
    file.write("Classification Accuracy Score: %f\n"%metrics.accuracy_score(y_test, y_pred))
    file.write("Precision Score: %f\n"%metrics.precision_score(y_test, y_pred))
    file.write("Recall Score: %f\n"%metrics.recall_score(y_test, y_pred))
    file.write("F1 Score: %f\n"%metrics.f1_score(y_test, y_pred))
    file.write("\n")
    file.write("True Value, Predicted Value, Iteration\n")
    for n in xrange(len(y_test)):
        file.write("%f,%f,%i\n"%(y_test[n],y_pred[n],(n+1)))
    file.close()
    title = "SVC linear %f"%test_size
    save = Output + "SVC_linear_confusion_matrix"+"_%s.png"%test_size
    plot_confusion_matrix(y_test, y_pred,title,save)
    lvltrace.lvltrace("LVLsortie dans SVC_linear split_test")
def gaussianNB(input_file,Output,test_size):
    lvltrace.lvltrace("LVLEntree dans gaussianNB split_test")
    ncol=tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1))
    X = data[:,1:]
    y = data[:,0]
    n_samples, n_features = X.shape
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
    print X_train.shape, X_test.shape
    # Instantiate the estimator
    clf = GaussianNB()
    # Fit the estimator to the data
    clf.fit(X_train, y_train)
    # Use the model to predict the last several labels
    y_pred = clf.predict(X_test)
    print "Gaussian Naive Bayes estimator accuracy "
    print "classification accuracy:", metrics.accuracy_score(y_test, y_pred)
    print "precision:", metrics.precision_score(y_test, y_pred)
    print "recall:", metrics.recall_score(y_test, y_pred)
    print "f1 score:", metrics.f1_score(y_test, y_pred)
    results = Output+"GaussianNB_metrics_test.txt"
    file = open(results, "w")
    file.write("Gaussian Naive Bayes estimator accuracy\n")
    file.write("Classification Accuracy Score: %f\n"%metrics.accuracy_score(y_test, y_pred))
    file.write("Precision Score: %f\n"%metrics.precision_score(y_test, y_pred))
    file.write("Recall Score: %f\n"%metrics.recall_score(y_test, y_pred))
    file.write("F1 Score: %f\n"%metrics.f1_score(y_test, y_pred))
    file.write("True Value, Predicted Value, Iteration\n")
    for n in xrange(len(y_test)):
        file.write("%f,%f,%i\n"%(y_test[n],y_pred[n],(n+1)))
    file.close()
    title = "Gaussian Naive Bayes %f"%test_size
    save = Output + "Gaussian_NB_confusion_matrix"+"_%s.png"%test_size
    plot_confusion_matrix(y_test, y_pred,title,save)
    lvltrace.lvltrace("LVLsortie dans gaussianNB split_test")
def nearest_centroid(input_file,Output,test_size):
    lvltrace.lvltrace("LVLEntree dans nearest_centroid split_test")
    ncol=tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1))
    X = data[:,1:]
    y = data[:,0]
    n_samples, n_features = X.shape
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
    print X_train.shape, X_test.shape
    clf = NearestCentroid()
    clf.fit(X_train,y_train)
    y_pred = clf.predict(X_test)
    print "Nearest Centroid Classifier "
    print "classification accuracy:", metrics.accuracy_score(y_test, y_pred)
    print "precision:", metrics.precision_score(y_test, y_pred)
    print "recall:", metrics.recall_score(y_test, y_pred)
    print "f1 score:", metrics.f1_score(y_test, y_pred)
    print "\n"
    results = Output+"Nearest_Centroid_metrics_test.txt"
    file = open(results, "w")
    file.write("Nearest Centroid Classifier estimator accuracy\n")
    file.write("Classification Accuracy Score: %f\n"%metrics.accuracy_score(y_test, y_pred))
    file.write("Precision Score: %f\n"%metrics.precision_score(y_test, y_pred))
    file.write("Recall Score: %f\n"%metrics.recall_score(y_test, y_pred))
    file.write("F1 Score: %f\n"%metrics.f1_score(y_test, y_pred))
    file.write("\n")
    file.write("True Value, Predicted Value, Iteration\n")
    for n in xrange(len(y_test)):
        file.write("%f,%f,%i\n"%(y_test[n],y_pred[n],(n+1)))
    file.close()
    title = "Nearest Centroid %f"%test_size
    save = Output + "Nearest_Centroid_confusion_matrix"+"_%s.png"%test_size
    plot_confusion_matrix(y_test, y_pred,title,save)
    lvltrace.lvltrace("LVLSortie dans stochasticGD split_test")
def stochasticGD(input_file,Output):
    lvltrace.lvltrace("LVLEntree dans stochasticGD")
    ncol=tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1))
    X = data[:,1:]
    y = data[:,0]
    n_samples, n_features = X.shape
    clf = SGDClassifier(loss="hinge", penalty="l2")
    clf.fit(X,y)
    y_pred = clf.predict(X)
    print "#########################################################################################################\n"
    print "Stochastic Gradient Descent "
    print "classification accuracy:", metrics.accuracy_score(y, y_pred)
    print "precision:", metrics.precision_score(y, y_pred)
    print "recall:", metrics.recall_score(y, y_pred)
    print "f1 score:", metrics.f1_score(y, y_pred)
    print "\n"
    print "#########################################################################################################\n"
    results = Output+"Stochastic_GD_metrics.txt"
    file = open(results, "w")
    file.write("Stochastic Gradient Descent estimator accuracy\n")
    file.write("Classification Accuracy Score: %f\n"%metrics.accuracy_score(y, y_pred))
    file.write("Precision Score: %f\n"%metrics.precision_score(y, y_pred))
    file.write("Recall Score: %f\n"%metrics.recall_score(y, y_pred))
    file.write("F1 Score: %f\n"%metrics.f1_score(y, y_pred))
    file.write("\n")
    file.write("True Value, Predicted Value, Iteration\n")
    for n in xrange(len(y)):
        file.write("%f,%f,%i\n"%(y[n],y_pred[n],(n+1)))
    file.close()
    title = "Stochastic Gradient Descent"
    save = Output + "Stochastic_GD_confusion_matrix.png"
    plot_confusion_matrix(y, y_pred,title,save)
    lvltrace.lvltrace("LVLSortie dans stochasticGD")
def extratreeclassifier(input_file,Output,test_size):
    lvltrace.lvltrace("LVLEntree dans extratreeclassifier split_test")
    ncol=tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1))
    X = data[:,1:]
    y = data[:,0]
    n_samples, n_features = X.shape
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
    print X_train.shape, X_test.shape
    clf = ExtraTreesClassifier(n_estimators=10)
    clf.fit(X_train,y_train)
    y_pred = clf.predict(X_test)
    print "Extremely Randomized Trees"
    print "classification accuracy:", metrics.accuracy_score(y_test, y_pred)
    print "precision:", metrics.precision_score(y_test, y_pred)
    print "recall:", metrics.recall_score(y_test, y_pred)
    print "f1 score:", metrics.f1_score(y_test, y_pred)
    print "\n"
    results = Output+"_Extremely_Random_Forest_metrics_test.txt"
    file = open(results, "w")
    file.write("Extremely Random Forest Classifier estimator accuracy\n")
    file.write("Classification Accuracy Score: %f\n"%metrics.accuracy_score(y_test, y_pred))
    file.write("Precision Score: %f\n"%metrics.precision_score(y_test, y_pred))
    file.write("Recall Score: %f\n"%metrics.recall_score(y_test, y_pred))
    file.write("F1 Score: %f\n"%metrics.f1_score(y_test, y_pred))
    file.write("\n")
    file.write("True Value, Predicted Value, Iteration\n")
    for n in xrange(len(y_test)):
        file.write("%f,%f,%i\n"%(y_test[n],y_pred[n],(n+1)))
    file.close()
    title = "Extremely Randomized Trees %f"%test_size
    save = Output + "Extremely_Randomized_Trees_confusion_matrix"+"_%s.png"%test_size
    plot_confusion_matrix(y_test, y_pred,title,save)
    lvltrace.lvltrace("LVLSortie dans extratreeclassifier split_test")
def merger_labelled(Preprocessed_file, file_name, Corrected_Features):
    lvltrace.lvltrace("LVLEntree dans merger_labelled dabs data_preproc")
    # Merge all features files into one
    file_random = Corrected_Features + '/' + file_name
    ncol = tools.file_col_coma(file_random)
    data = np.loadtxt(file_random, delimiter=',', usecols=range(ncol - 1))
    X = data[:, 1:]
    n_samples, n_features = X.shape
    fout = open(Preprocessed_file, "a")
    fout.write("Class,")
    for n in xrange(1, n_features + 1):
        fout.write("feature_%i," % n)
    fout.write("\n")
    # first file:
    first_file = Corrected_Features + file_name
    for line in open(first_file):
        fout.write(line)
    # now the rest:
    for root, dirs, files in os.walk(Corrected_Features):
        for i in files:
            if i != file_name:
                if not i.startswith('.'):
                    f = open(Corrected_Features + i)
                    f.next()  #Skip the header
                    for line in f:
                        fout.write(line)
                    f.close()
    fout.close()
    lvltrace.lvltrace("LVLSortie dans merger_labelled dans data_preproc")
def pca(input_file,Output):
    lvltrace.lvltrace("LVLEntree dans pca unsupervised")
    ncol=tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1))
    X = data[:,1:]
    y = data[:,0]
    n_samples, n_features = X.shape
    # instantiate the model
    model = PCA(n_components=2)
    # fit the model: notice we don't pass the labels!
    model.fit(X)
    # transform the data to two dimensions
    X_PCA = model.transform(X)
    print "#########################################################################################################\n"
    print "PCA"
    print "shape of result:", X_PCA.shape
    print model.explained_variance_ratio_
    print "#########################################################################################################\n"
    
    results = Output+"pca.txt"
    file = open(results, "w")
    file.write("PCA\n")
    file.write("shape of result: %f,%f\n"%(X_PCA.shape[0],X_PCA.shape[1]))
    file.write("Explained variance ratio: %f,%f\n"%(model.explained_variance_ratio_[0],model.explained_variance_ratio_[1]))
    file.close()
    
    # plot the results along with the labels
    fig, ax = plt.subplots()
    im = ax.scatter(X_PCA[:, 0], X_PCA[:, 1], c=y)
    fig.colorbar(im);
    save = Output + "pca.png"
    plt.savefig(save)
    lvltrace.lvltrace("LVLSortie dans pca unsupervised")
def kmeans(input_file, n_clusters, Output):
    lvltrace.lvltrace("LVLEntree dans kmeans unsupervised")
    ncol=tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1))
    X = data[:,1:]
    y = data[:,0]
    sample_size, n_features = X.shape
    k_means=cluster.KMeans(init='k-means++', n_clusters=n_clusters, n_init=10)
    k_means.fit(X)
    reduced_data = k_means.transform(X)
    values = k_means.cluster_centers_.squeeze()
    labels = k_means.labels_
    k_means_cluster_centers = k_means.cluster_centers_
    print "#########################################################################################################\n"
    #print y
    #print labels
    print "K-MEANS\n"
    print('homogeneity_score: %f'%metrics.homogeneity_score(y, labels))
    print('completeness_score: %f'%metrics.completeness_score(y, labels))
    print('v_measure_score: %f'%metrics.v_measure_score(y, labels))
    print('adjusted_rand_score: %f'%metrics.adjusted_rand_score(y, labels))
    print('adjusted_mutual_info_score: %f'%metrics.adjusted_mutual_info_score(y,  labels))
    print('silhouette_score: %f'%metrics.silhouette_score(X, labels, metric='euclidean', sample_size=sample_size))
    print('\n')
    print "#########################################################################################################\n"
    results = Output+"kmeans_scores.txt"
    file = open(results, "w")
    file.write("K-Means Scores\n")
    file.write("Homogeneity Score: %f\n"%metrics.homogeneity_score(y, labels))
    file.write("Completeness Score: %f\n"%metrics.completeness_score(y, labels))
    file.write("V-Measure: %f\n"%metrics.v_measure_score(y, labels))
    file.write("The adjusted Rand index: %f\n"%metrics.adjusted_rand_score(y, labels))
    file.write("Adjusted Mutual Information: %f\n"%metrics.adjusted_mutual_info_score(y,  labels))
    file.write("Silhouette Score: %f\n"%metrics.silhouette_score(X, labels, metric='euclidean', sample_size=sample_size))
    file.write("\n")
    file.write("True Value, Cluster numbers, Iteration\n")
    for n in xrange(len(y)):
        file.write("%f, %f, %i\n"%(y[n],labels[n],(n+1)))
    file.close()
    import pylab as pl
    from itertools import cycle
    # plot the results along with the labels
    k_means_cluster_centers = k_means.cluster_centers_
    fig, ax = plt.subplots()
    im=ax.scatter(X[:, 0], X[:, 1], c=labels, marker='.')
    for k in xrange(n_clusters):
        my_members = labels == k
        cluster_center = k_means_cluster_centers[k]
        ax.plot(cluster_center[0], cluster_center[1], 'w', color='b',
                marker='x', markersize=6)
    fig.colorbar(im)
    plt.title("Number of clusters: %i"%n_clusters)
    save = Output + "kmeans.png"
    plt.savefig(save)
    lvltrace.lvltrace("LVLsortie dans kmeans unsupervised")
def forest_of_trees(input_file, Output):

    import numpy as np

    from sklearn.datasets import make_classification
    from sklearn.ensemble import ExtraTreesClassifier
    lvltrace.lvltrace("LVLEntree dans forest_of_trees dans feature_selection")

    # Build a classification task using 3 informative features
    ncol = tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol - 1))
    X = data[:, 1:]
    y = data[:, 0]
    #print X
    #print y
    sample_size, n_features = X.shape

    # Build a forest and compute the feature importances
    forest = ExtraTreesClassifier(n_estimators=250, random_state=0)

    forest.fit(X, y)
    importances = forest.feature_importances_
    std = np.std([tree.feature_importances_ for tree in forest.estimators_],
                 axis=0)
    indices = np.argsort(importances)[::-1]

    results = Output + "forest_of_tree.txt"
    file = open(results, "w")
    file.write("Feature Ranking\n")

    # Print the feature ranking
    #print("Feature ranking:")

    for f in range(n_features):
        file.write("%d. feature %d (%f)\n" %
                   (f + 1, indices[f] + 1, importances[indices[f]]))
        #print("%d. feature %d (%f)" % (f + 1, indices[f]+1, importances[indices[f]]))
    file.close()
    # Plot the feature importances of the forest
    import pylab as pl
    pl.figure()
    pl.title("Feature importances: Forest of trees applied to Layers + Types")
    pl.bar(range(n_features),
           importances[indices],
           color="r",
           yerr=std[indices],
           align="center")
    pl.xticks(range(n_features), indices + 1)
    pl.axis('tight')
    #pl.xlim([-1, 73])
    save = Output + "forest_of_tree.png"
    pl.savefig(save)
    lvltrace.lvltrace("LVLSortie dans forest_of_trees dans feature_selection")
def gmm(input_file,Output):
    lvltrace.lvltrace("LVLEntree dans gmm unsupervised")
    print "#########################################################################################################\n"
    print "GMM"
    print "#########################################################################################################\n"
    ncol=tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1))
    X = data[:,1:]
    y = data[:,0]
    n_samples, n_features = X.shape
    
    # Fit a mixture of gaussians with EM using five components
    gmm = mixture.GMM(n_components=5, covariance_type='spherical', init_params = 'wmc')
    gmm.fit(X)

    # Fit a dirichlet process mixture of gaussians using five components
    dpgmm = mixture.DPGMM(n_components=5, covariance_type='spherical',init_params = 'wmc')
    dpgmm.fit(X)

    color_iter = itertools.cycle(['r', 'g', 'b', 'c', 'm', 'b','g','r','c','m','y','k','b','g','r','c','m','y','k','b','g','r','c','m','y','k','b','g','r','c','m','y','k'])

    for i, (clf, title) in enumerate([(gmm, 'GMM'),
                                      (dpgmm, 'Dirichlet Process GMM')]):
        splot = pl.subplot(2, 1, 1 + i)
        Y_ = clf.predict(X)
        for i, (mean, covar, color) in enumerate(
                zip(clf.means_, clf._get_covars(), color_iter)):
            v, w = linalg.eigh(covar)
            u = w[0] / linalg.norm(w[0])
            # as the DP will not use every component it has access to
            # unless it needs it, we shouldn't plot the redundant
            # components.
            if not np.any(Y_ == i):
                continue
            pl.scatter(X[Y_ == i, 0], X[Y_ == i, 1], .8, color=color)
            
            # Plot an ellipse to show the Gaussian component
            angle = np.arctan(u[1] / u[0])
            angle = 180 * angle / np.pi  # convert to degrees
            ell = mpl.patches.Ellipse(mean, v[0], v[1], 180 + angle, color=color)
            ell.set_clip_box(splot.bbox)
            ell.set_alpha(0.5)
            splot.add_artist(ell)
        pl.xticks(())
        pl.yticks(())
        pl.title(title)
    save = Output + "gmm.png"
    plt.savefig(save)
    lvltrace.lvltrace("LVLSortie dans gmm unsupervised")
def multinomialNB(input_file,Output):
    lvltrace.lvltrace("LVLEntree dans multinomialNB")
    try:
        ncol=tools.file_col_coma(input_file)
        data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1))
        X = data[:,1:]
        y = data[:,0]
        n_samples, n_features = X.shape
        # Instantiate the estimator
        clf = MultinomialNB()
        # Fit the estimator to the data
        clf.fit(X, y)
        # Use the model to predict the last several labels
        y_pred = clf.predict(X)
        print "#########################################################################################################\n"
        print "Multinomial Naive Bayes estimator accuracy "
        print "classification accuracy:", metrics.accuracy_score(y, y_pred)
        print "precision:", metrics.precision_score(y, y_pred)
        print "recall:", metrics.recall_score(y, y_pred)
        print "f1 score:", metrics.f1_score(y, y_pred)
        print "\n"
        print "#########################################################################################################\n"
        results = Output+"Multinomial_NB_metrics.txt"
        file = open(results, "w")
        file.write("Multinomial Naive Bayes estimator accuracy\n")
        file.write("Classification Accuracy Score: %f\n"%metrics.accuracy_score(y, y_pred))
        file.write("Precision Score: %f\n"%metrics.precision_score(y, y_pred))
        file.write("Recall Score: %f\n"%metrics.recall_score(y, y_pred))
        file.write("F1 Score: %f\n"%metrics.f1_score(y, y_pred))
        file.write("True Value, Predicted Value, Iteration\n")
        for n in xrange(len(y)):
            file.write("%f,%f,%i\n"%(y[n],y_pred[n],(n+1)))
        file.close()
        title = "Multinomial Naive Bayes"
        save = Output + "Multinomial_NB_confusion_matrix.png"
        plot_confusion_matrix(y, y_pred,title,save)
    except (ValueError):
        if configuration.normalization == 'normalize':
            results = Output+"Multinomial_NB_metrics.txt"
            file = open(results, "w")
            file.write("In configuration.py file, normalization=normalize -- Input Values must be superior to 0\n")
            file.close()
    lvltrace.lvltrace("LVLSortie dans multinomialNB")
def lda(input_file,Output):
    lvltrace.lvltrace("LVLEntree dans lda")
    ncol=tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1))
    X = data[:,1:]
    y = data[:,0]
    n_samples, n_features = X.shape
    #lda=LDA(n_components=2)
    lda=LDA()
    lda.fit(X,y)
    X_LDA = lda.transform(X)
    y_pred = lda.predict(X)
    print "#########################################################################################################\n"
    print "Linear Discriminant Analysis Accuracy "
    print "classification accuracy:", metrics.accuracy_score(y, y_pred)
    print "precision:", metrics.precision_score(y, y_pred)
    print "recall:", metrics.recall_score(y, y_pred)
    print "f1 score:", metrics.f1_score(y, y_pred)
    print "\n"
    print "#########################################################################################################\n"
    results = Output+"LDA_metrics.txt"
    file = open(results, "w")
    file.write("Linear Discriminant Analaysis estimator accuracy\n")
    file.write("Classification Accuracy Score: %f\n"%metrics.accuracy_score(y, y_pred))
    file.write("Precision Score: %f\n"%metrics.precision_score(y, y_pred))
    file.write("Recall Score: %f\n"%metrics.recall_score(y, y_pred))
    file.write("F1 Score: %f\n"%metrics.f1_score(y, y_pred))
    file.write("\n")
    file.write("True Value, Predicted Value, Iteration\n")
    for n in xrange(len(y)):
        file.write("%f,%f,%i\n"%(y[n],y_pred[n],(n+1)))
    file.close()
    title = "LDA"
    save = Output + "LDA_confusion_matrix.png"
    plot_confusion_matrix(y, y_pred,title,save)
    # plot the results along with the labels
    fig, ax = plt.subplots()
    im = ax.scatter(X_LDA[:, 0], X_LDA[:, 1], c=y)
    fig.colorbar(im);
    save_lda = Output + "LDA_plot.png"
    plt.savefig(save_lda)
    plt.close()
    lvltrace.lvltrace("LVLSortie dans lda")
def lda(input_file,Output,test_size):
    lvltrace.lvltrace("LVLEntree dans lda split_test")
    ncol=tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1))
    X = data[:,1:]
    y = data[:,0]
    n_samples, n_features = X.shape
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
    print X_train.shape, X_test.shape
    lda=LDA(n_components=2)
    lda.fit(X_train,y_train)
    X_LDA = lda.transform(X_train)
    print "shape of result:", X_LDA.shape
    y_pred = lda.predict(X_test)
    print "Linear Discriminant Analysis Accuracy "
    print "classification accuracy:", metrics.accuracy_score(y_test, y_pred)
    print "precision:", metrics.precision_score(y_test, y_pred)
    print "recall:", metrics.recall_score(y_test, y_pred)
    print "f1 score:", metrics.f1_score(y_test, y_pred)
    #LVLprint "\n"
    results = Output+"LDA_metrics_test.txt"
    file = open(results, "w")
    file.write("Linear Discriminant Analaysis estimator accuracy\n")
    file.write("Classification Accuracy Score: %f\n"%metrics.accuracy_score(y_test, y_pred))
    file.write("Precision Score: %f\n"%metrics.precision_score(y_test, y_pred))
    file.write("Recall Score: %f\n"%metrics.recall_score(y_test, y_pred))
    file.write("F1 Score: %f\n"%metrics.f1_score(y_test, y_pred))
    file.write("\n")
    file.write("True Value, Predicted Value, Iteration\n")
    for n in xrange(len(y_test)):
        file.write("%f,%f,%i\n"%(y_test[n],y_pred[n],(n+1)))
    file.close()
    title = "LDA %f"%test_size
    save = Output + "LDA_confusion_matrix"+"_%s.png"%test_size
    plot_confusion_matrix(y_test, y_pred,title,save)
    # plot the results along with the labels
    fig, ax = plt.subplots()
    im = ax.scatter(X_LDA[:, 0], X_LDA[:, 1], c=y_train)
    fig.colorbar(im);
    save_lda = Output + "LDA_plot_test"+"_%s.png"%test_size
    plt.savefig(save_lda)
    plt.close()
    lvltrace.lvltrace("LVLSortie dans lda split_test")
def qda(input_file,Output):
    lvltrace.lvltrace("LVLEntree dans qda")
    try:
        ncol=tools.file_col_coma(input_file)
        data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1))
        X = data[:,1:]
        y = data[:,0]
        n_samples, n_features = X.shape
        qda=QDA()
        qda.fit(X,y)
        y_pred = qda.predict(X)
        print "#########################################################################################################\n"
        print "Quadratic Discriminant Analysis Accuracy "
        print "classification accuracy:", metrics.accuracy_score(y, y_pred)
        print "precision:", metrics.precision_score(y, y_pred)
        print "recall:", metrics.recall_score(y, y_pred)
        print "f1 score:", metrics.f1_score(y, y_pred)
        print "\n"
        print "#########################################################################################################\n"
        results = Output+"QDA_metrics.txt"
        file = open(results, "w")
        file.write("Quadratic Discriminant Analaysis estimator accuracy\n")
        file.write("Classification Accuracy Score: %f\n"%metrics.accuracy_score(y, y_pred))
        file.write("Precision Score: %f\n"%metrics.precision_score(y, y_pred))
        file.write("Recall Score: %f\n"%metrics.recall_score(y, y_pred))
        file.write("F1 Score: %f\n"%metrics.f1_score(y, y_pred))
        file.write("\n")
        file.write("True Value, Predicted Value, Iteration\n")
        for n in xrange(len(y)):
            file.write("%f,%f,%i\n"%(y[n],y_pred[n],(n+1)))
        file.close()
        title = "QDA"
        save = Output + "QDA_confusion_matrix.png"
        plot_confusion_matrix(y, y_pred,title,save)
    except (AttributeError):
        if configuration.normalization == 'normalize':
            results = Output+"Multinomial_NB_metrics.txt"
            file = open(results, "w")
            file.write("In configuration.py file, normalization='normalize' -- Input Values must be superior to 0\n")
            file.close()
    lvltrace.lvltrace("LVLSortie dans qda")
def qda(input_file,Output,test_size):
    lvltrace.lvltrace("LVLEntree dans qda split_test")
    try:
        ncol=tools.file_col_coma(input_file)
        data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1))
        X = data[:,1:]
        y = data[:,0]
        n_samples, n_features = X.shape
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
        print X_train.shape, X_test.shape
        qda=QDA()
        qda.fit(X_train,y_train)
        y_pred = qda.predict(X_test)
        print "Quadratic Discriminant Analysis Accuracy "
        print "classification accuracy:", metrics.accuracy_score(y_test, y_pred)
        print "precision:", metrics.precision_score(y_test, y_pred)
        print "recall:", metrics.recall_score(y_test, y_pred)
        print "f1 score:", metrics.f1_score(y_test, y_pred)
        #LVLprint "\n"
        results = Output+"QDA_metrics_test.txt"
        file = open(results, "w")
        file.write("Quadratic Discriminant Analaysis estimator accuracy\n")
        file.write("Classification Accuracy Score: %f\n"%metrics.accuracy_score(y_test, y_pred))
        file.write("Precision Score: %f\n"%metrics.precision_score(y_test, y_pred))
        file.write("Recall Score: %f\n"%metrics.recall_score(y_test, y_pred))
        file.write("F1 Score: %f\n"%metrics.f1_score(y_test, y_pred))
        file.write("\n")
        file.write("True Value, Predicted Value, Iteration\n")
        for n in xrange(len(y_test)):
            file.write("%f,%f,%i\n"%(y_test[n],y_pred[n],(n+1)))
        file.close()
        title = "QDA %f"%test_size
        save = Output + "QDA_confusion_matrix"+"_%s.png"%test_size
        plot_confusion_matrix(y_test, y_pred,title,save)
    except (AttributeError):
        if configuration.normalization == 'normalize':
            results = Output+"Multinomial_NB_metrics_test.txt"
            file = open(results, "w")
            file.write("In configuration.py file, normalization='normalize' -- Input Values must be superior to 0\n")
            file.close()
    lvltrace.lvltrace("LVLSortie dans qda split_test")
def Radius_Neighbors(input_file,Output,test_size):
    lvltrace.lvltrace("LVLEntree dans radius_kneighbors split_test")
    try:
        ncol=tools.file_col_coma(input_file)
        data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1))
        X = data[:,1:]
        y = data[:,0]
        n_samples, n_features = X.shape
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
        print X_train.shape, X_test.shape
        clf = RadiusNeighborsClassifier(radius=0.001, weights='uniform', algorithm='auto')
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        print "Radius Neighbors accuracy "
        print "classification accuracy:", metrics.accuracy_score(y_test, y_pred)
        print "precision:", metrics.precision_score(y_test, y_pred)
        print "recall:", metrics.recall_score(y_test, y_pred)
        print "f1 score:", metrics.f1_score(y_test, y_pred)
        print "\n"
        results = Output+"Raidus_Neighbors_metrics_test.txt"
        file = open(results, "w")
        file.write("Radius Neighbors estimator accuracy\n")
        file.write("Classification Accuracy Score: %f\n"%metrics.accuracy_score(y_test, y_pred))
        file.write("Precision Score: %f\n"%metrics.precision_score(y_test, y_pred))
        file.write("Recall Score: %f\n"%metrics.recall_score(y_test, y_pred))
        file.write("F1 Score: %f\n"%metrics.f1_score(y_test, y_pred))
        file.write("\n")
        file.write("True Value, Predicted Value, Iteration\n")
        for n in xrange(len(y_test)):
            file.write("%f,%f,%i\n"%(y_test[n],y_pred[n],(n+1)))
        file.close()
        title = "Radius Neighbors %f"%test_size
        save = Output + "Radius_Neighbors_confusion_matrix"+"_%s.png"%test_size
        plot_confusion_matrix(y_test, y_pred,title,save)
    except (ValueError):
        results = Output+"Raidus_Neighbors_metrics_test.txt"
        file = open(results, "w")
        file.write("In configuration.py file:  No neighbors found for test samples, you can try using larger radius, give a label for outliers, consider or removing them from your dataset.")
        file.close()
    lvltrace.lvltrace("LVLSortie dans radius_kneighbors split_test")
def meanshift(input_file,Output):
    lvltrace.lvltrace("LVLEntree dans meanshift unsupervised")
    ncol=tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1))
    X = data[:,1:]
    y = data[:,0]
    sample_size, n_features = X.shape
    # Compute clustering with MeanShift
    # The bandwidth can be estimated automatically from the data
    bandwidth = estimate_bandwidth(X, quantile=0.2, n_samples=sample_size)
    ms = MeanShift(bandwidth=bandwidth)
    ms.fit(X)
    labels = ms.labels_
    cluster_centers = ms.cluster_centers_
    labels_unique = np.unique(labels)
    n_clusters_ = len(labels_unique)
    print "#########################################################################################################\n"
    print "Mean Shift"
    print("number of estimated clusters : %d" % n_clusters_)
    #print labels
    #print y
    print('homogeneity_score: %f'%metrics.homogeneity_score(y, labels))
    print('completeness_score: %f'%metrics.completeness_score(y, labels))
    print('v_measure_score: %f'%metrics.v_measure_score(y, labels))
    print('adjusted_rand_score: %f'%metrics.adjusted_rand_score(y, labels))
    print('adjusted_mutual_info_score: %f'%metrics.adjusted_mutual_info_score(y,  labels))
    try:
        print('silhouette_score: %f'%metrics.silhouette_score(X, labels, metric='euclidean', sample_size=sample_size))
    except (ValueError):
        print "ValueError: Number of labels is 1 but should be more than 2and less than n_samples - 1"
    print "\n"
    print "#########################################################################################################\n"
    results = Output+"mean_shift_metrics.txt"
    file = open(results, "w")
    file.write("Mean Shift\n")
    file.write("Homogeneity Score: %f\n"%metrics.homogeneity_score(y, labels))
    file.write("Completeness Score: %f\n"%metrics.completeness_score(y, labels))
    file.write("V-Measure: %f\n"%metrics.v_measure_score(y, labels))
    file.write("The adjusted Rand index: %f\n"%metrics.adjusted_rand_score(y, labels))
    file.write("Adjusted Mutual Information: %f\n"%metrics.adjusted_mutual_info_score(y,  labels))
    try:
        file.write("Silhouette Score: %f\n"%metrics.silhouette_score(X, labels, metric='euclidean', sample_size=sample_size))
    except (ValueError):
        file.write("ValueError: Number of labels is 1 but should be more than 2and less than n_samples - 1")
    file.write("\n")
    file.write("True Value, Clusters, Iteration\n")
    for n in xrange(len(y)):
        file.write("%f,%f,%i\n"%(y[n],labels[n],(n+1)))
    file.close()

    # Plot result
    import pylab as pl
    from itertools import cycle
    fig, ax = plt.subplots()
    im=ax.scatter(X[:, 0], X[:, 1], c=labels, marker='.')
    for k in xrange(n_clusters_):
        my_members = labels == k
        cluster_center = cluster_centers[k]
        #print cluster_center[0], cluster_center[1]
        ax.plot(cluster_center[0], cluster_center[1], 'w', color='b',
                marker='x', markersize=6)
    fig.colorbar(im);
    plt.title('Estimated number of clusters: %d' % n_clusters_)
    save = Output + "mean_shift.png"
    plt.savefig(save)
    lvltrace.lvltrace("LVLSortie dans meanshift unsupervised")
def univariate(input_file, Output, percentile):
    ###############################################################################
    # import some data to play with
    lvltrace.lvltrace("LVLEntree dans univariate dans feature_selection")
    ncol = tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol - 1))
    X = data[:, 1:]
    y = data[:, 0]
    sample_size, n_features = X.shape

    ###############################################################################
    pl.figure(1)
    pl.clf()

    X_indices = np.arange(X.shape[-1])
    #print X_indices

    ###############################################################################
    # Univariate feature selection with F-test for feature scoring
    # We use the default selection function: the 10% most significant features
    selector = SelectPercentile(f_classif, percentile=percentile)
    selector.fit(X, y)
    scores = -np.log10(selector.pvalues_)
    scores /= scores.max()
    pl.bar(X_indices - .45,
           scores,
           width=.2,
           label=r'Univariate score ($-Log(p_{value})$)',
           color='g')

    ###############################################################################
    # Compare to the weights of an SVM
    clf = svm.SVC(kernel='linear')
    clf.fit(X, y)

    svm_weights = (clf.coef_**2).sum(axis=0)
    svm_weights /= svm_weights.max()

    pl.bar((X_indices + 1) - .25,
           svm_weights,
           width=.2,
           label='SVM weight',
           color='r')

    clf_selected = svm.SVC(kernel='linear')
    clf_selected.fit(selector.transform(X), y)

    svm_weights_selected = (clf_selected.coef_**2).sum(axis=0)
    svm_weights_selected /= svm_weights_selected.max()

    pl.bar(X_indices[selector.get_support()] - .05,
           svm_weights_selected,
           width=.2,
           label='SVM weights after selection',
           color='b')

    pl.title("Feature selection")
    pl.xlabel('Feature number')
    pl.yticks(())
    pl.axis('tight')
    pl.legend(loc='upper right')
    save = Output + "univariate.png"
    pl.savefig(save)
    # Print the feature ranking
    results = Output + "univariate.txt"
    file = open(results, "w")
    file.write("Feature Ranking\n")
    #print len(X_indices[selector.get_support()])
    for i in xrange(len(X_indices[selector.get_support()])):
        #print i
        #print (X_indices[selector.get_support()][i]+1)
        #print svm_weights_selected[i]
        file.write("%f,%f\n" % ((X_indices[selector.get_support()][i] + 1),
                                svm_weights_selected[i]))
    file.close()

    #print("Feature ranking:")
    #print (X_indices[selector.get_support()] +1)
    #print svm_weights_selected
    lvltrace.lvltrace("LVLSortie dans univariate dans feature_selection")
Пример #29
0
def KMeans_PCA(input_file, n_clusters, Output):
    lvltrace.lvltrace("LVLEntree dans KMeans_PCA unsupervised")
    ncol=tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1))
    X = data[:,1:]
    y = data[:,0]
    sample_size, n_features = X.shape
    reduced_data = PCA(n_components=2).fit_transform(X)
    k_means = KMeans(init='k-means++', n_clusters=n_clusters, n_init=50)
    k_means.fit(reduced_data)
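    # NOTE (added): k-means runs on the 2-D PCA projection so that the
    # decision regions below can be drawn directly in that plane;
    # clustering the full feature space and projecting afterwards would
    # generally give different assignments.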
    labels = k_means.labels_
    print "#########################################################################################################\n"
    print "K-MEANS on PCA-reduced data"
    #print labels
    #print y
    print('homogeneity_score: %f'%metrics.homogeneity_score(y, labels))
    print('completeness_score: %f'%metrics.completeness_score(y, labels))
    print('v_measure_score: %f'%metrics.v_measure_score(y, labels))
    print('adjusted_rand_score: %f'%metrics.adjusted_rand_score(y, labels))
    print('adjusted_mutual_info_score: %f'%metrics.adjusted_mutual_info_score(y,  labels))
    print('silhouette_score: %f'%metrics.silhouette_score(X, labels, metric='euclidean', sample_size=sample_size))
    print "\n"
    print "#########################################################################################################\n"
    results = Output+"kmeans_PCA_metrics.txt"
    file = open(results, "w")
    file.write("K-Means clustering on the PCA-reduced data\n")
    file.write("Homogeneity Score: %f\n"%metrics.homogeneity_score(y, labels))
    file.write("Completeness Score: %f\n"%metrics.completeness_score(y, labels))
    file.write("V-Measure: %f\n"%metrics.v_measure_score(y, labels))
    file.write("The adjusted Rand index: %f\n"%metrics.adjusted_rand_score(y, labels))
    file.write("Adjusted Mutual Information: %f\n"%metrics.adjusted_mutual_info_score(y,  labels))
    file.write("Silhouette Score: %f\n"%metrics.silhouette_score(X, labels, metric='euclidean', sample_size=sample_size))
    file.write("\n")
    file.write("True Value, Clusters, Iteration\n")
    for n in xrange(len(y)):
        file.write("%f,%f,%i\n"%(y[n],labels[n],(n+1)))
    file.close()
    # Step size of the mesh. Decrease to increase the quality of the VQ.
    h = .02     # point in the mesh [x_min, x_max]x[y_min, y_max]
    # Plot the decision boundary. For that, we assign a color to each
    # point in the mesh.
    x_min, x_max = reduced_data[:, 0].min(), reduced_data[:, 0].max()
    y_min, y_max = reduced_data[:, 1].min(), reduced_data[:, 1].max()
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    # Obtain labels for each point in mesh. Use last trained model.
    Z = k_means.predict(np.c_[xx.ravel(), yy.ravel()])
    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    pl.figure(1)
    pl.clf()
    pl.imshow(Z, interpolation='nearest',
              extent=(xx.min(), xx.max(), yy.min(), yy.max()),
              cmap=pl.cm.Paired,
              aspect='auto', origin='lower')
    pl.plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=2)
    # Plot the centroids as a white X
    centroids = k_means.cluster_centers_
    pl.scatter(centroids[:, 0], centroids[:, 1],
               marker='x', s=169, linewidths=3,
               color='w', zorder=10)
    pl.title('K-means clustering on the PCA-reduced data\n'
             'Number of clusters: %i'%n_clusters)
    pl.xlim(x_min, x_max)
    pl.ylim(y_min, y_max)
    pl.xticks(())
    pl.yticks(())
    save = Output + "kmeans_PCA.png"
    pl.savefig(save)
    lvltrace.lvltrace("LVLSortie dans KMeans_PCA unsupervised")
def affinitypropagation(input_file,type,pref,Output):
    lvltrace.lvltrace("LVLEntree dans affinitypropagation unsupervised")
    ncol=tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1))

    X = data[:,1:]
    #print (" ici X vaut ")
    #print X
    #print (" fin de print X")
    labels_true = data[:,0]
    # Choice of similarity, to be tested
    if type == 'spearmanr':
        X = scipy.stats.stats.spearmanr(X, axis=1)[0]
    elif type == 'euclidean':
        X = -euclidean_distances(X, squared=True)
    else:
        print "affinitypropagation: unknown similarity type '%s'" % type
    if pref == 'median':
        # To be tested: min vs. median
        preference = np.median(X)
    elif pref == 'mean':
        preference = np.mean(X)
    elif pref == 'min':
        preference = np.min(X)
    else:
        print "affinitypropagation: unknown preference '%s'" % pref
    print "#########################################################################################################\n"
    print "Affinity Propagation"
    print('Preference: %f' % preference)
    n_samples, n_features = X.shape
    cluster_centers_indices, labels = affinity_propagation(X, preference=preference)
    #print cluster_centers_indices
    n_clusters_ = len(cluster_centers_indices)
    #print labels_true
    #print labels
    print('Estimated number of clusters: %d' % n_clusters_)
    print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
    print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
    print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
    print("Adjusted Rand Index: %0.3f"
          % metrics.adjusted_rand_score(labels_true, labels))
    print("Adjusted Mutual Information: %0.3f"
          % metrics.adjusted_mutual_info_score(labels_true, labels))
    print("Silhouette Coefficient: %0.3f"
          % metrics.silhouette_score(X, labels, metric='sqeuclidean'))
    print "\n"
    print "#########################################################################################################\n"
    results = Output+"affinity_propagation.txt"
    file = open(results, "w")
    file.write("Affinity Propagation\n")
    file.write("Homogeneity Score: %f\n"%metrics.homogeneity_score(labels_true, labels))
    file.write("Completeness Score: %f\n"%metrics.completeness_score(labels_true, labels))
    file.write("V-Measure: %f\n"%metrics.v_measure_score(labels_true, labels))
    file.write("The adjusted Rand index: %f\n"%metrics.adjusted_rand_score(labels_true, labels))
    file.write("Adjusted Mutual Information: %f\n"%metrics.adjusted_mutual_info_score(labels_true,  labels))
    file.write("Silhouette Score: %f\n"%metrics.silhouette_score(X, labels, metric='sqeuclidean'))
    file.write("\n")
    file.write("True Value, Clusters, Iteration\n")
    for n in xrange(len(labels_true)):
        file.write("%f,%f,%i\n"%(labels_true[n],labels[n],(n+1)))
    file.close()
    
    # Plot result
    import pylab as pl
    from itertools import cycle
    pl.close('all')
    pl.figure(1)
    pl.clf()
    colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmykbgrcmykbgrcmykbg')
    for k, col in zip(range(n_clusters_), colors):
        class_members = labels == k
        cluster_center = X[cluster_centers_indices[k]]
        pl.plot(X[class_members, 0], X[class_members, 1], col + '.')
        pl.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col,
                markeredgecolor='k', markersize=14)
        for x in X[class_members]:
            pl.plot([cluster_center[0], x[0]], [cluster_center[1], x[1]], col)
    pl.title('Estimated number of clusters: %d' % n_clusters_)
    save = Output + "affinity_propagation.png"
    pl.savefig(save)
    lvltrace.lvltrace("LVLSortie dans affinitypropagation unsupervised")
def dbscan(input_file, Output):
    lvltrace.lvltrace("LVLEntree dans dbscan unsupervised")
    # Generate sample data
    ncol=tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1))
    X = data[:,1:]
    labels_true = data[:,0]
    #X = StandardScaler().fit_transform(X)
    # Compute DBSCAN
    db = DBSCAN().fit(X)
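    # NOTE (added): DBSCAN is fitted with scikit-learn's defaults
    # (eps=0.5, min_samples=5); when features live on very different
    # scales, standardizing first, as in the commented line above, is
    # usually advisable.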
    core_samples = db.core_sample_indices_
    labels = db.labels_
    print "#########################################################################################################\n"
    print "DBSCAN"
    print labels_true
    print labels
    # Number of clusters in labels, ignoring noise if present.
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    print('Estimated number of clusters: %d' % n_clusters_)
    print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
    print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
    print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
    print("Adjusted Rand Index: %0.3f"
          % metrics.adjusted_rand_score(labels_true, labels))
    print("Adjusted Mutual Information: %0.3f"
          % metrics.adjusted_mutual_info_score(labels_true, labels))
    print("Silhouette Coefficient: %0.3f"
          % metrics.silhouette_score(X, labels))
    print "\n"
    print "#########################################################################################################\n"
    results = Output+"dbscan.txt"
    file = open(results, "w")
    file.write("DBSCAN\n")
    file.write("Homogeneity Score: %f\n"%metrics.homogeneity_score(y, labels))
    file.write("Completeness Score: %f\n"%metrics.completeness_score(y, labels))
    file.write("V-Measure: %f\n"%metrics.v_measure_score(y, labels))
    file.write("The adjusted Rand index: %f\n"%metrics.adjusted_rand_score(y, labels))
    file.write("Adjusted Mutual Information: %f\n"%metrics.adjusted_mutual_info_score(y,  labels))
    file.write("Silhouette Score: %f\n"%metrics.silhouette_score(X, labels, metric='euclidean', sample_size=sample_size))
    file.write("\n")
    file.write("True Value, Clusters, Iteration\n")
    for n in xrange(len(y)):
        file.write("%f,%f,%i\n"%(y[n],labels[n],(n+1)))
    file.close()
    
    # Plot result
    import pylab as pl
    # Black removed and is used for noise instead.
    unique_labels = set(labels)
    colors = pl.cm.Spectral(np.linspace(0, 1, len(unique_labels)))
    for k, col in zip(unique_labels, colors):
        if k == -1:
            # Black used for noise.
            col = 'k'
            markersize = 6
        class_members = [index[0] for index in np.argwhere(labels == k)]
        cluster_core_samples = [index for index in core_samples
                                if labels[index] == k]
        for index in class_members:
            x = X[index]
            if index in core_samples and k != -1:
                markersize = 14
            else:
                markersize = 6
            pl.plot(x[0], x[1], 'o', markerfacecolor=col,
                    markeredgecolor='k', markersize=markersize)
    pl.title('Estimated number of clusters: %d' % n_clusters_)
    save = Output + "dbscan.png"
    pl.savefig(save)
    lvltrace.lvltrace("LVLSortie dans dbscan unsupervised")