Example #1
def main():

    # read the <trainSet> and <testSet> files passed as command-line arguments
    trainSet, testSet, stopwords_bn = read_files()

    # True = binary sentiment classification / False = six-way topic classification (multi-class labels)
    use_sentiment = False

    # split each file into tokenized documents and class labels
    # (this is the input format for the SVM models)
    trainDoc, trainClass = modify_corpus(trainSet, use_sentiment)
    testDoc, testClass = modify_corpus(testSet, use_sentiment)

    # show the class distribution in the training and testing sets
    distribution(trainClass, testClass)

    # Run the best of the 3 models (answer Y at the prompts below to see every variant)
    print("\n\n Running the best Model - Non-Linear SVM:")
    run_best_model(trainDoc, trainClass, testDoc, testClass, stopwords_bn, use_sentiment)

    # Optionally run both the linear and non-linear classifiers with different C/gamma values
    print("\n\n Do you want to see the output of all variants of the SVM classifier?:")
    c = str(input("[Y/N]:"))
    if c == 'Y' or c == 'y':
        # run all the 3 classifiers
        run_all_classifiers(trainDoc, trainClass, testDoc, testClass, stopwords_bn, use_sentiment)


    # Optionally run K-Means clustering
    print("\n\n Do you want to run K-means clustering?:")
    c = str(input("[Y/N]:"))
    if c == 'Y' or c == 'y':
        # run K-Means clustering on the combined documents and labels (use_sentiment = False -> six-way topic classification)
        K_Means.run_all_classifiers(trainDoc+testDoc, trainClass+testClass, stopwords_bn, use_sentiment)


    # Optionally run the Multinomial Naive Bayes classifier
    print("\n\n Do you want to run the Multinomial Naive Bayes Classifier?:")
    c = str(input("[Y/N]:"))
    if c == 'Y' or c == 'y':
        # run the NB classifier (use_sentiment = False -> six-way topic classification)
        Naive_Bayes.run_all_classifiers(trainDoc, trainClass, testDoc, testClass, stopwords_bn, use_sentiment)


    # Optionally run the Decision Tree / KNN classifiers
    print("\n\n Do you want to run the Decision Tree/KNN Classifier?:")
    c = str(input("[Y/N]:"))
    if c == 'Y' or c == 'y':
        # run the DT/KNN classifiers (use_sentiment = False -> six-way topic classification)
        DT_KNN.run_all_classifiers(trainDoc, trainClass, testDoc, testClass, stopwords_bn, use_sentiment)
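For context, a minimal sketch of the kind of pipeline a "non-linear SVM" text classifier like run_best_model typically wraps: TF-IDF features feeding an RBF-kernel SVC. The toy documents and all parameter values below are assumptions, not the project's actual code.

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

# Hypothetical stand-in for run_best_model: TF-IDF features + non-linear (RBF) SVM.
model = Pipeline([
    ("tfidf", TfidfVectorizer()),
    ("svm", SVC(kernel="rbf", C=10, gamma="scale")),
])
train_docs = ["good movie", "bad plot", "great acting", "terrible film"]  # toy data
train_labels = ["pos", "neg", "pos", "neg"]
model.fit(train_docs, train_labels)
print(model.predict(["great movie"]))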
Example #2
 def start(self, data):
     print("from html req:", data)
     result = ""
     if len(data) <= 1:
         result = "<p>出错</p>"
     else:
         name = data.get("name", "")
         if name == "expansion_data":
             result = smote_expansion_data.expansion_data()
         elif name == "smote_compare":
             result = smote_compare.draw_pic()
         elif name == "DBSCAN":
             result = DBSCAN.deal(data.get("all_file", ""))
         elif name == const.DECISION or name == const.RANDOM_FOREST:
             result = decision.deal(data)
         elif name == "XGBoost":
             result = XGBoost.deal_real(data)
         elif name == "KMeans":
             result = K_Means.deal(data["all_file"])
     self.send_response(200)
     self.send_header("Content-type", "text/html;charset = UTF-8")
     self.send_header("Access-Control-Allow-Origin", "*")
     self.end_headers()
     print("send to html response : {}".format(result))
     self.wfile.write(result.encode())
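start reads like the body of a BaseHTTPRequestHandler method; a minimal, self-contained sketch of how such a dispatcher is typically wired up (the Handler class, port, and JSON request body are assumptions, not this project's setup):

import json
from http.server import BaseHTTPRequestHandler, HTTPServer

class Handler(BaseHTTPRequestHandler):  # hypothetical wrapper around start()
    def do_POST(self):
        length = int(self.headers.get("Content-Length", 0))
        data = json.loads(self.rfile.read(length) or "{}")
        result = "<p>ok: {}</p>".format(data.get("name", ""))
        self.send_response(200)
        self.send_header("Content-type", "text/html;charset=UTF-8")
        self.send_header("Access-Control-Allow-Origin", "*")
        self.end_headers()
        self.wfile.write(result.encode())

if __name__ == "__main__":
    HTTPServer(("localhost", 8000), Handler).serve_forever()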
Example #3
def total():
    check_dic()

    csv_dealer.csv_deal_to_file("./data/training_csv/",no_total=1,to_addr="./data/training_csv_cut")
    csv_dealer.csv_deal_to_file('./data/test_csv/',no_total=1,to_addr='./data/test_csv_cut')
    csv_dealer.csv_deal_to_file('./data/final_test/',no_total=1,to_addr='./data/final_test_cut')

    mat_list = nmf_sklearn.generate_new_mat_list('./data/training_csv')
    total_mat = csv_dealer.mat_list_to_total(mat_list)
    np.savetxt("./data/total.csv", total_mat, delimiter=',', fmt="%f")
    w, h = nmf_sklearn.nmf_sklearn(3, total_mat)
    K_Means.k_means(w, 250, new_file_addr='./data/class_list.txt')

    probability.run()
    get_Sit.run()
    cal_f1.run()
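The nmf_sklearn.nmf_sklearn(3, total_mat) call followed by K_Means.k_means(w, 250, ...) suggests a factor-then-cluster pipeline. A minimal sketch of the decomposition step, assuming it wraps sklearn's NMF (the random matrix below is only a stand-in for total_mat):

import numpy as np
from sklearn.decomposition import NMF

total_mat = np.abs(np.random.rand(100, 20))  # stand-in; NMF requires non-negative input
model = NMF(n_components=3, init='nndsvda', max_iter=500)
w = model.fit_transform(total_mat)  # (100, 3) row factors, the 'w' clustered above
h = model.components_               # (3, 20) basis rows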
Example #4
def clustering_and_scheduling(layout, channel_losses_mat, n_links_on, clustering_method):
    N = np.shape(layout)[0]
    assert np.shape(layout) == (N, 4)
    assert np.shape(channel_losses_mat) == (N, N)
    assert 1 <= n_links_on <= N
    if clustering_method == "Spectral Clustering":
        clusters_one_layout = Spectral_Clustering.clustering(layout, channel_losses_mat, n_links_on)
        adj_mat = Spectral_Clustering.construct_adj_mat(channel_losses_mat)
        allocs_one_layout = adjacency_mat_based_scheduling(adj_mat, clusters_one_layout)
    elif clustering_method == "Hierarchical Clustering":
        clusters_one_layout = Hierarchical_Clustering.clustering(layout, channel_losses_mat, n_links_on)
        adj_mat = Spectral_Clustering.construct_adj_mat(channel_losses_mat)
        allocs_one_layout = adjacency_mat_based_scheduling(adj_mat, clusters_one_layout)
    elif clustering_method == "K-Means":
        clusters_one_layout, centroids_one_layout = K_Means.clustering(layout, n_links_on)
        allocs_one_layout = GLI_based_scheduling(layout, centroids_one_layout, clusters_one_layout)
    elif clustering_method == "Hierarchical Clustering EqualSize":
        clusters_one_layout = Hierarchical_Clustering_EqualSize.clustering(layout, channel_losses_mat, n_links_on)
        adj_mat = Spectral_Clustering.construct_adj_mat(channel_losses_mat)
        allocs_one_layout = adjacency_mat_based_scheduling(adj_mat, clusters_one_layout)
    elif clustering_method == "K-Means EqualSize":
        clusters_one_layout, centroids_one_layout = K_Means_EqualSize.clustering(layout, n_links_on)
        allocs_one_layout = GLI_based_scheduling(layout, centroids_one_layout, clusters_one_layout)
    else:
        print("Invalid clustering method name: {}!".format(clustering_method))
        exit(1)
    return clusters_one_layout, allocs_one_layout
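Three of the branches above rely on construct_adj_mat, which is not shown. One plausible reading, offered strictly as a guess, is that link pairs with strong cross-channels (i.e. strong mutual interference) become adjacent so the scheduler keeps them apart:

import numpy as np

def construct_adj_mat_sketch(channel_losses_mat, threshold=1e-10):
    # Hypothetical: connect two links when the cross-channel between them is
    # above a strength threshold. Both the rule and the threshold value are
    # assumptions, not the project's actual construct_adj_mat.
    adj = (channel_losses_mat > threshold).astype(int)
    np.fill_diagonal(adj, 0)       # ignore each link's own channel
    return np.maximum(adj, adj.T)  # symmetrize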
Example #5
    def apply_algorithm(self, alg_name, training_vectors, test_vectors_clean, test_vectors_anomalous):
        """Applies the specified algorithm onto the feature sets.
        """
        if alg_name == 'lof':
            result_clean, result_anomalous, result_training = LOF.local_outlier_detection(training_vectors, test_vectors_clean, test_vectors_anomalous)

        elif alg_name == 'svm':
            result_clean, result_anomalous, result_training = SVM.one_class_svm(training_vectors, test_vectors_clean, test_vectors_anomalous)

        elif alg_name == 'dbscan':
            result_clean, result_anomalous, result_training = DBSCAN.dbscan(training_vectors, test_vectors_clean, test_vectors_anomalous)

        elif alg_name == 'kmeans':
            result_clean, result_anomalous, result_training = K_Means.k_means(training_vectors, test_vectors_clean, test_vectors_anomalous)

        else:
            raise NameError("Invalid Algorithm Name")

        return result_clean, result_anomalous, result_training
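K_Means.k_means here follows the same (training, clean, anomalous) interface as the other detectors. A self-contained sketch of one way such a k-means anomaly detector could work, flagging vectors far from their nearest centroid; the function name, k, and the percentile threshold are all assumptions:

import numpy as np
from sklearn.cluster import KMeans

def k_means_sketch(training_vectors, test_vectors_clean, test_vectors_anomalous, k=8):
    # Hypothetical detector: fit k-means on clean training data, then score
    # every vector by its distance to the nearest centroid.
    km = KMeans(n_clusters=k, n_init=10).fit(training_vectors)
    def dists(X):
        return np.min(km.transform(X), axis=1)  # distance to nearest centroid
    threshold = np.percentile(dists(training_vectors), 99)
    # True = flagged as anomalous
    return (dists(test_vectors_clean) > threshold,
            dists(test_vectors_anomalous) > threshold,
            dists(training_vectors) > threshold)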
Example #6
output.close()
print('Parameters saved to ' + filenames['params'] + '.pkl')

#########################################################

print('Initiating K-Means...')

k_results = list()

for i in range(numRounds):
    start = datetime.now()
    iGenerations = qk_rounds_genNum[i]

    numInits = int(iGenerations * numOracles * initsPercentage)

    k_centroids, k_assignment, k_timings, k_inertia = K_Means.k_means(mixture, numClusters, numInits)
    k_results.append([k_centroids, k_assignment, k_timings, k_inertia])

    round_secs = (datetime.now() - start).total_seconds()
    print(float(i + 1) * 100 / numRounds, '%\t', 'round ', i, ':', round_secs, 's  -  estimated:',
          (float(numRounds - 1) - i) * round_secs, 's / ', (float(numRounds - 1) - i) * round_secs / 60, 'm')

print('Preparing K-Means data structures...')

k_rounds = dict()
k_rounds['centroids'] = list()
k_rounds['assignment'] = list()
k_rounds['times'] = list()
k_rounds['inertia'] = list()

for i in range(numRounds):
    k_rounds['centroids'].append(k_results[i][0])
Example #7
def gaussian_mixture(data,
                     Km,
                     init_method,
                     epsilon,
                     niterations,
                     plotflag,
                     r,
                     RSEED=123):
    '''Fit a Gaussian mixture model via EM, sweeping the number of mixture
    components K from 1 to Km - 1. The data are contained in the N x d
    "data" matrix.

    INPUTS
      data: N x d real-valued data matrix
      Km: exclusive upper bound on the number of clusters (mixture components)
      init_method: 0 for random memberships, 1 for random parameters,
                   otherwise K-Means initialization
      epsilon: convergence threshold on the change in penalized log-likelihood
      niterations (optional): maximum number of iterations to perform (default 500)
      plotflag (optional): 1 to plot parameters during learning,
                           0 for no plotting (default 0)
      RSEED (optional): initial seed value for the random number generator

    OUTPUTS
      llBIC: list of BIC-penalized log-likelihoods, one per value of K
      Ks: the corresponding values of K

    Note on the internal quantities:
      - z[k] is the probability that a randomly selected row belongs to
        component (or cluster) k (so it is the "cluster size")
      - compMat[i][k] = p(cluster k | x_i), the probability (computed via
        Bayes' rule) that vector x_i was generated by cluster k, according
        to the "generative" probabilistic model.
    '''

    # your code goes here....
    N = len(data)

    # initialize....
    llBIC = []
    Ks = []
    for K in range(1, Km):
        lls = []
        mu = [np.array([0., 0.]) for j in range(K)]
        sigma = [np.array([[0., 0.], [0., 0.]]) for j in range(K)]
        z = [1 / float(K) for i in range(K)]
        eFirst = False
        mFirst = False
        prevLikelihood = 0
        compMat = list()
        if init_method == 0:
            compMat = randomWeights(N, K)
            mFirst = True
        elif init_method == 1:
            mu, sigma = randomParams(len(data[0]), K)
            eFirst = True

        else:
            clusters = K_Means.kMeans(data, K, 1)
            mu = [clusters[j].mean for j in range(K)]
            sigma = [np.eye(len(clusters[0].mean)) for i in range(K)]
            eFirst = True
        done = False
        iterations = 0
        #plt.ion()
        iters = []
        ll = []
        while not done:

            iterations += 1

            # perform E-step...
            if eFirst:
                list1 = [[] for j in range(K)]
                maxList = []
                compMat = list()
                for d in data:
                    x = [gaussian(d, mu[k], sigma[k]) * z[k] for k in range(K)]
                    x = [j / sum(x) for j in x]
                    compMat.append(x)
                    list1[np.array(x).argmax()].append(d)
                compMat = np.array(compMat)
                #plotCluster(list1,mu,sigma)
                mFirst = True

            # perform M-step...
            if mFirst:
                # print(len(compMat[:, 0]))
                z = [sum(compMat[:, k]) / float(len(data)) for k in range(K)]
                mu = [np.array([0., 0.]) for j in range(K)]
                maxDist = 0
                for k in range(K):
                    for j in range(len(data)):

                        mu[k] += compMat[j][k] * data[j]

                    mu[k] /= float((z[k] * len(data)))

                sigma = [np.array([[0., 0.], [0., 0.]]) for j in range(K)]
                for k in range(K):
                    p = z[k]
                    p = p * len(data)
                    for j in range(len(data)):
                        c = (data[j] - mu[k])
                        sigma[k] += compMat[j][k] * np.outer(c, c)
                    sigma[k] /= p
                eFirst = True
                if iterations > niterations:
                    done = True

            print "Iteration....", iterations
            # compute log-likelihood and print to screen.....
            log_likelihood = 0
            final_sum = 0
            for d in data:
                log_likelihood = 0
                for j in range(K):
                    log_likelihood += z[j] * gaussian(d, mu[j], sigma[j])
                final_sum += np.log(log_likelihood)
            final_sum -= ((3 * K * np.log(len(data))) / 2)
            ll.append(final_sum)
            if abs(prevLikelihood - final_sum) < epsilon:
                print "Converged after...", iterations
                done = True
            prevLikelihood = final_sum

            iters.append(iterations)
        #lls.append((ll, iters))
        llBIC.append(final_sum)
        Ks.append(K)

        # check for convergence.....

        #plt.ioff()
        #plt.show()
    return llBIC, Ks
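The EM loop above calls gaussian(d, mu, sigma), randomWeights, and randomParams, which are defined elsewhere; gaussian is presumably the multivariate normal density, e.g. something like this sketch:

import numpy as np
from scipy.stats import multivariate_normal

def gaussian(x, mu, sigma):
    # Multivariate normal pdf at x; allow_singular guards against the
    # near-degenerate covariances that can appear early in EM.
    return multivariate_normal(mean=np.ravel(mu), cov=sigma, allow_singular=True).pdf(x)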
Example #8
from sklearn import datasets

import K_Means, Plot

blobs = datasets.make_blobs()[0]
centers, clusters, colors, color_labels = K_Means.k_means(blobs, 3)
Plot.plot(blobs, centers, colors, color_labels)
Example #9
"""Investigate the performance of the neural network for different sizes of hidden layer."""

import random

import MNIST_Loader
import RBFNetwork
import K_Means

#loadMNISTData() returns a dataset which is a list of length 70,000 containing (image, label) tuples:
    #image is the input data in the form of a 784x1 vector where each element is a normalized greyscale value
    #label is the target data in the form of a 10x1 vector with a 1 in the index of the target value
training_dataset, testing_dataset = MNIST_Loader.load_data()
num_centroids = 16
num_iterations = 100
random.shuffle(training_dataset)
print("K_Means Clustering in progress...")
final_centroids, final_clusters = K_Means.k_means_clustering(num_centroids, training_dataset[:1000].copy(),num_iterations)

print("LOOKK HERE")
print(len(final_centroids))
#hyper-parameters
learning_rate = 0.1
epochs = 10
num_folds = 5

scores = RBFNetwork.k_fold_cross_validation(training_dataset[:1000], num_folds, learning_rate, epochs, final_centroids, final_clusters)
print('\nMean Classification Accuracy: %.3f%%' % (sum(scores)/float(len(scores))))

print("\nTesting with MNIST testing dataset")
# initialize and train our network
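The snippet breaks off here. For reference, the hidden layer of an RBF network typically applies a Gaussian to the distance between an input and each K-Means centroid; a minimal sketch of that activation, with the width parameter beta and the function name as assumptions about what RBFNetwork computes:

import numpy as np

def rbf_activations(x, centroids, beta=1.0):
    # Gaussian RBF: phi_j(x) = exp(-beta * ||x - c_j||^2) for each centroid c_j.
    c = np.asarray(centroids, dtype=float).reshape(len(centroids), -1)
    d2 = np.sum((c - np.ravel(x)) ** 2, axis=1)
    return np.exp(-beta * d2)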

Example #10
n = int(n)

total_bytes = float((n * d + k * d + n * k) * 4)
print('Memory used by arrays:\t', total_bytes / 1024, '\tKBytes')
print('\t\t\t', total_bytes / (1024 * 1024), '\tMBytes')

print('Memory used by data:  \t', n * d * 4 / 1024, '\t', 'KBytes')

## Generate data
#data = np.random.random((n,d)).astype(np.float32)
data, groundTruth = datasets.make_blobs(n_samples=n,n_features=d,centers=k,
                                        center_box=(-1000.0,1000.0))
data = data.astype(np.float32)

grouper = K_Means(N=n,D=d,K=k)
centroids = grouper._init_centroids(data)

times = dict()


# Distance matrix
start = timer()
dist_mat_np = grouper._np_calc_dists(data,centroids)
times["dist_mat np"] = timer() - start

start = timer()
dist_mat_cu_auto = grouper._cu_calc_dists(data,centroids,gridDim=None,
                                     blockDim=None,memManage='auto',
                                     keepDataRef=False)
times["dist_mat cuda auto"] = timer() - start
Example #11
import xlrd as xd
import numpy as np
from sklearn.cluster import AgglomerativeClustering

import K_Means

# K-means clustering algorithm

# Load the data
print('K-means clustering')
data = xd.open_workbook('data.xls')
table = data.sheets()[0]
height = table.col_values(3)[1:]
weight = table.col_values(4)[1:]
dataSet = np.vstack((height, weight)).T
# Cluster
k = 2
centroids, clusterAssment = K_Means.kmeans(dataSet, k)
# Visualize
K_Means.showCluster(dataSet, k, centroids, clusterAssment)

# Hierarchical clustering algorithm

# Set up the hierarchical clustering function
print('hierarchical clustering')
linkages = ['ward', 'average', 'complete']
n_clusters_ = 2
ac = AgglomerativeClustering(linkage=linkages[2], n_clusters=n_clusters_)
# Fit the data
ac.fit(dataSet)
# Cluster label for each sample
labels = ac.labels_
# Visualize
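The original snippet breaks off at this visualization step. A minimal, self-contained matplotlib sketch of what it might look like (the random data stands in for the real height/weight dataSet):

import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import AgglomerativeClustering

dataSet = np.random.rand(100, 2)  # stand-in for the height/weight matrix
labels = AgglomerativeClustering(linkage='complete', n_clusters=2).fit(dataSet).labels_
plt.scatter(dataSet[:, 0], dataSet[:, 1], c=labels)
plt.show()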
Example #12
pickle.dump(params, output)
output.close()
print('Parameters saved to ' + filenames['params'] + '.pkl')

#########################################################

print('Initiating K-Means...')

print("Each round will have ", numInits, " K-Means initializations.")

k_results = list()

for i in range(numRounds):
    start = datetime.now()

    k_centroids, k_assignment, k_timings, k_inertia = K_Means.k_means(
        mixture, numClusters, numInits)
    k_results.append([k_centroids, k_assignment, k_timings, k_inertia])

    round_secs = (datetime.now() - start).total_seconds()
    print(float(i + 1) * 100 / numRounds, '%\t', 'round ', i, ':', round_secs,
          's  -  estimated:', (float(numRounds - 1) - i) * round_secs, 's / ',
          (float(numRounds - 1) - i) * round_secs / 60, 'm')

print('Preparing K-Means data structures...')

k_rounds = dict()
k_rounds['centroids'] = list()
k_rounds['assignment'] = list()
k_rounds['times'] = list()
Example #13
Y = CL.Moving_Mean(X, 5)
#[CL.aritmeticky_prumer_fce(X, x, 5) for x in range(len(X))]
Z = CL.Exp_Moving_Mean(X, 5)
#[CL.suma_zleva_fce(X, x, 5) for x in range(len(X))]
XX = np.vstack((Y, Z, X)).T
# np.shape(XX)
# plt.scatter(Y,X)
# plt.show()

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(Y, Z, X)
plt.show()
km.Nakresly(XX, 2, "3d")

clf = km.K_means(2)
clf.fit(XX)
predictions = []
for pred in XX:
    predictions.append(clf.Predict(pred))
print(predictions)
plt.plot(np.arange(len(X)), X, linewidth=0.5)
plt.scatter(np.arange(len(X)), X, c=predictions, cmap=plt.cm.plasma)
plt.show()

ZZ = np.load(way + "No2.npy")
plt.plot(ZZ[0], ZZ[1])
plt.show()
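CL.Moving_Mean and CL.Exp_Moving_Mean come from a project module not shown here; sketches of the standard simple and exponential moving means they presumably compute (the _sketch names and pandas-based implementations are assumptions):

import numpy as np
import pandas as pd

def moving_mean_sketch(x, window):
    # simple moving average over a trailing window (shorter at the start)
    return pd.Series(x).rolling(window, min_periods=1).mean().to_numpy()

def exp_moving_mean_sketch(x, span):
    # exponentially weighted moving average
    return pd.Series(x).ewm(span=span, adjust=False).mean().to_numpy()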
Example #14
total_bytes = float((n * d + k * d + n * k) * 4)
print('Memory used by arrays:\t', total_bytes / 1024, '\tKBytes')
print('\t\t\t', total_bytes / (1024 * 1024), '\tMBytes')

print('Memory used by data:  \t', n * d * 4 / 1024, '\t', 'KBytes')

## Generate data
#data = np.random.random((n,d)).astype(np.float32)
data, groundTruth = datasets.make_blobs(n_samples=n,
                                        n_features=d,
                                        centers=k,
                                        center_box=(-1000.0, 1000.0))
data = data.astype(np.float32)

grouper = K_Means(N=n, D=d, K=k)
centroids = grouper._init_centroids(data)

times = dict()

# Distance matrix
start = timer()
dist_mat_np = grouper._np_calc_dists(data, centroids)
times["dist_mat np"] = timer() - start

start = timer()
dist_mat_cu_auto = grouper._cu_calc_dists(data,
                                          centroids,
                                          gridDim=None,
                                          blockDim=None,
                                          memManage='auto',
                                          keepDataRef=False)
times["dist_mat cuda auto"] = timer() - start