def answer(test_path):

    import warnings
    warnings.filterwarnings("ignore")

    import time
    t0 = time.time()

    from learning import process_test_data, training_data, training_answers
    from sklearn.cluster import KMeans
    from sklearn.linear_model import LogisticRegression

    test_data = process_test_data(test_path)

    km = KMeans()
    km.fit(training_data, training_answers)  # KMeans ignores the y argument

    myNum = km.predict(test_data).item()

    numX = [1, 2, 4, 2, 7, 0, 2, 7, 4, 3, 2, 1, 4, 5, 5, 1, 3, 0, 4, 2]
    numbers = [[num] for num in numX]
    letX = [
        'a', 'a', 'o', 'a', 'o', 'o', 'a', 'a', 'o', 'a', 'a', 'o', 'a', 'o',
        'o', 'o', 'a', 'a', 'o', 'a'
    ]
    letters = [[letter] for letter in letX]

    lr = LogisticRegression()
    lr.fit(numbers, letters)

    ans = lr.predict([[myNum]]).item()  # predict expects a 2-D array

    t1 = time.time()
    return [ans, t1 - t0]
Example #2
def k_way_spectral_clustering():
    x = np.load('q2data.npy')
    A = np.load('AMatrix.npy')
    WeightMatrix = np.zeros((16, 16))
    for i in range(16):
        for j in range(16):
            if A[i][j] == 1:
                WeightMatrix[i][j] = np.exp(-1 * ((np.linalg.norm(x[i] - x[j]) ** 2)))
            else:
                WeightMatrix[i][j] = 0

    degrees = np.sum(WeightMatrix, axis=1)
    # the degree matrix must be diagonal, not a flat vector
    DegreeMatrix = np.diag(degrees)
    L = DegreeMatrix - WeightMatrix
    DSquareRoot = np.diag(1.0 / np.sqrt(degrees))
    Lnorm = np.dot(np.dot(DSquareRoot, L), DSquareRoot)

    eigvals, eigvecs = np.linalg.eig(Lnorm)
    eigvals = np.real(eigvals)
    eigvecs = np.array(np.real(eigvecs), dtype=np.float64)
    # embed each node with the eigenvectors of the smallest eigenvalues
    sortedinds = eigvals.argsort()
    embedding = eigvecs[:, sortedinds[:3]]

    kmeans = KMeans(n_clusters=3, init='random')
    kmeans.fit(embedding)
    components = kmeans.labels_
    return components
Example #3
 def ClusterBalance(self, indexesToPick, stopCount, kmeansFlag=True):
     print "ClusterBalancing..."
     indexesPicked = []
     obs1 = self.observations[indexesToPick]
     obs = normalize(obs1, axis=0)
     if len(indexesToPick) != 0:
         if kmeansFlag:
             if(len(indexesToPick) < self.numClusters):
                 cluster = KMeans(init='k-means++', n_clusters=len(obs), n_init=10)
             else:
                 cluster = KMeans(init='k-means++', n_clusters=self.numClusters, n_init=10)
         else:
             if(len(indexesToPick) < self.numClusters):
                 cluster = SpectralClustering(n_clusters=len(obs), n_init=10)  # estimator class from sklearn.cluster
             else:
                 cluster = SpectralClustering(n_clusters=self.numClusters, n_init=10)
         cluster.fit(obs)
         labels = cluster.labels_
         whenToStop = max(2, stopCount)
         count = 0
         while count != whenToStop:
             cluster_list = range(self.numClusters)
             index = 0
             for j in labels:
                 if j in cluster_list:
                     indexesPicked.append(indexesToPick[index])
                     cluster_list.remove(j)
                     count += 1
                     if count == whenToStop:
                         break
                     labels[index] = -1
                     if len(cluster_list) == 0:
                         break
                 index += 1
     return indexesPicked
Example #4
def train_k_means_by_step(n_clusters, init_cluster_centers, x_array, eps):
    # eps = 1e-4
    # eps = 0.1
    # eps = 100.0
    # prev_sample = np.array(clf.cluster_centers_, np.float)
    prev_centers = init_cluster_centers
    clf = KMeans(init=prev_centers,
                 n_clusters=n_clusters,
                 n_init=1,
                 n_jobs=-1,
                 tol=eps,
                 max_iter=1)
    # if isinstance(prev_centers, str):
    #     prev_centers = clf.cluster_centers_
    clf.fit(x_array)
    new_centers = clf.cluster_centers_

    centers_list = [prev_centers, new_centers]
    args = [1]
    values = [clf.inertia_]
    while get_distance(prev_centers, new_centers) > eps:
        prev_centers = new_centers
        clf = KMeans(init=prev_centers,
                     n_clusters=n_clusters,
                     n_init=1,
                     n_jobs=-1,
                     tol=eps,
                     max_iter=1).fit(x_array)
        new_centers = clf.cluster_centers_
        args.append(len(args) + 1)
        values.append(clf.inertia_)
        centers_list.append(new_centers)
    # print "k = %s, len centers = %s" % (n_clusters, len(f_values))
    return args, values, centers_list
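The loop above relies on a get_distance helper that this excerpt does not show. A minimal sketch, assuming it measures the largest shift of any single center between two consecutive iterations (the centers stay in matching order because each refit is initialized from the previous centers):

import numpy as np


def get_distance(prev_centers, new_centers):
    # assumed helper: largest Euclidean move of any single cluster center
    prev_centers = np.asarray(prev_centers, dtype=float)
    new_centers = np.asarray(new_centers, dtype=float)
    return np.max(np.linalg.norm(prev_centers - new_centers, axis=1))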
def performKmeans(data,n_clusters):
    
    print "Performing K-Means on data"
    est = KMeans(n_clusters)
    est.fit(data)
    orb_cb_handler.store_estimator(est)
    
    return est
def performKmeans(data,n_clusters):
    
    print "Performing K-Means on data"
    est = KMeans(n_clusters)
    est.fit(data)
    labels = est.labels_
    labels_np = np.array(labels)
    
    return labels,est
Example #7
 def start_algorithm(self):
     """
     start clustering the stored tweets
     :return: list of clusters containing tweets
     """
     vectors = self.vectorize_data()
     kmeans = KMeans(init='k-means++', n_clusters=self.cluster_amount, n_init=10)
     kmeans.fit(vectors)
     return self.cluster_tweet(kmeans.labels_)
Example #8
def performKmeans(data, n_clusters):

    print "Performing K-Means on data"
    est = KMeans(n_clusters)
    est.fit(data)
    labels = est.labels_
    labels_np = np.array(labels)

    return labels, est
Example #9
def evaluate_kmeans_unsupervised(data, nclusters, k_init=20):
    """
    Clusters data with kmeans algorithm and then returns the cluster centroids
    :param data: Points that need to be clustered as a numpy array
    :param nclusters: Total number of clusters
    :param k_init: Number of times the k-means algorithm is run with different centroid seeds
    :return: Cluster centroids found by k-means
    """
    kmeans = KMeans(n_clusters=nclusters, n_init=k_init)
    kmeans.fit(data)
    return kmeans.cluster_centers_
Example #10
 def start_algorithm(self):
     """
     start clustering the stored tweets
     :return: list of clusters containing tweets
     """
     vectors = self.vectorize_data()
     kmeans = KMeans(init='k-means++',
                     n_clusters=self.cluster_amount,
                     n_init=10)
     kmeans.fit(vectors)
     return self.cluster_tweet(kmeans.labels_)
def evaluateKMeans(data, labels, nclusters, method_name):
    '''
    Clusters data with kmeans algorithm and then returns the string containing method name and metrics, and also the evaluated cluster centers
    :param data: Points that need to be clustered as a numpy array
    :param labels: True labels for the given points
    :param nclusters: Total number of clusters
    :param method_name: Name of the method from which the clustering space originates (only used for printing)
    :return: Formatted string containing metrics and method name, cluster centers
    '''
    kmeans = KMeans(n_clusters=nclusters, n_init=20)
    kmeans.fit(data)
    return getClusterMetricString(method_name, labels, kmeans.labels_), kmeans.cluster_centers_
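getClusterMetricString is defined elsewhere in that project; a minimal sketch, assuming it only formats a couple of standard sklearn clustering scores next to the method name:

from sklearn import metrics


def getClusterMetricString(method_name, labels_true, labels_pred):
    # assumed helper: one line of text with NMI and ARI for the given method
    nmi = metrics.normalized_mutual_info_score(labels_true, labels_pred)
    ari = metrics.adjusted_rand_score(labels_true, labels_pred)
    return "%-30s NMI: %.3f  ARI: %.3f" % (method_name, nmi, ari)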
Example #12
def _centroids(n_clusters: int,
               points: List[List[float]]) -> List[List[float]]:
    """ Return n_clusters centroids of points
    """

    k_means = KMeans(n_clusters=n_clusters)
    k_means.fit(points)

    closest, _ = pairwise_distances_argmin_min(k_means.cluster_centers_,
                                               points)

    return list(map(list, np.array(points)[closest.tolist()]))
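A small usage sketch (the points are made up). Note that the function returns the input points closest to each k-means centroid rather than the centroids themselves, which matters when the result must be an actual observation:

points = [[0.0, 0.0], [0.1, 0.2], [5.0, 5.1], [5.2, 4.9]]
print(_centroids(2, points))  # two of the original points, one per cluster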
Example #13
def evaluateKMeans(data, labels, nclusters, method_name):
    '''
    Clusters data with kmeans algorithm and then returns the string containing method name and metrics, and also the evaluated cluster centers
    :param data: Points that need to be clustered as a numpy array
    :param labels: True labels for the given points
    :param nclusters: Total number of clusters
    :param method_name: Name of the method from which the clustering space originates (only used for printing)
    :return: Formatted string containing metrics and method name, cluster centers
    '''
    kmeans = KMeans(n_clusters=nclusters, n_init=20)
    kmeans.fit(data)
    return getClusterMetricString(method_name, labels,
                                  kmeans.labels_), kmeans.cluster_centers_
Example #14
def extract_word_clusters(commentList, commentCount):
    brown_ic = wordnet_ic.ic('ic-brown.dat')
    a, corpus, global_synsets = extract_global_bag_of_words(commentList, True)
    similarity_dict = {}
    i = 0
    t = len(global_synsets)**2
    
    for syn_out in global_synsets:
        similarity_dict[syn_out] = {} 
        for syn_in in global_synsets:
            if syn_in.pos() == syn_out.pos():
                similarity_dict[syn_out][syn_in] = syn_out.lin_similarity(syn_in, brown_ic)
            else:
                similarity_dict[syn_out][syn_in] = max(wn.path_similarity(syn_out,syn_in), wn.path_similarity(syn_in,syn_out))
        
            if i % 10000 == 0:
                print i, 'synsets processed out of',len(global_synsets)**2, '(',float(i)/(t),'%)'
            i += 1

    tuples = [(i[0], i[1].values()) for i in similarity_dict.items()] 
    vectors = [np.array(tup[1]) for tup in tuples]

    
    # Rule of thumb
    n = int(sqrt(len(global_synsets) / 2))  # KMeans needs an integer cluster count
    print "Number of clusters", n
    km_model = KMeans(n_clusters=n)
    km_model.fit(vectors)
    
    clustering = collections.defaultdict(list)
    for idx, label in enumerate(km_model.labels_):
        clustering[label].append(tuples[idx][0])
        
    pprint.pprint(dict(clustering), width=1)
    
    feature_vector = np.zeros([len(corpus),n])
    
    for i,comment in enumerate(corpus):
        for w in comment:
            for key, clust in clustering.items():
                if w in clust:
                    feature_vector[i][key] += 1
        if i % 1000 == 0:
            print i, 'comments processed'
        
    print feature_vector
Example #15
def runKMeans(distance_matrix, nClusters, number_of_threads):

    km = KMeans(n_clusters=nClusters,
                max_iter=100,
                init='k-means++',
                precompute_distances=True,
                n_jobs=number_of_threads)
    km.fit(distance_matrix)

    labels = km.labels_
    # k-means never assigns the noise label -1, so these counts only matter
    # when the labels come from a density-based method such as DBSCAN
    n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
    n_noises = list(labels).count(-1)

    print('Number of clusters: ' + str(n_clusters))
    print('Number of noises: ' + str(n_noises))

    return list(labels)
Example #16
def evaluate_k_means_raw(data, true_labels, n_clusters, k_init):
    """
    Clusters data with K-Means algorithm and then returns clustering accuracy and NMI
    :param data: Points that need to be clustered as a numpy array
    :param true_labels: True labels for the given points
    :param n_clusters: Total number of clusters
    :return: ACC, NMI
    """
    # https://github.com/Datamine/MNIST-K-Means-Clustering/blob/master/Kmeans.ipynb
    # http://johnloeber.com/docs/kmeans.html
    # Llyod's Algorithm for K-Means Clustering

    kmeans = KMeans(n_clusters=n_clusters, n_init=k_init)
    kmeans.fit(data)
    acc = cluster_acc(true_labels, kmeans.labels_)
    nmi = metrics.normalized_mutual_info_score(true_labels, kmeans.labels_)
    return acc, nmi
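cluster_acc is not shown in this excerpt; a minimal sketch of the usual unsupervised clustering accuracy, assuming it matches predicted cluster ids to true labels with the Hungarian algorithm before scoring:

import numpy as np
from scipy.optimize import linear_sum_assignment


def cluster_acc(y_true, y_pred):
    # contingency table: rows are predicted clusters, columns are true labels
    y_true = np.asarray(y_true).astype(int)
    y_pred = np.asarray(y_pred).astype(int)
    d = max(y_pred.max(), y_true.max()) + 1
    w = np.zeros((d, d), dtype=int)
    for i in range(y_pred.size):
        w[y_pred[i], y_true[i]] += 1
    # best one-to-one mapping of clusters to labels, then the matched fraction
    row_ind, col_ind = linear_sum_assignment(-w)
    return w[row_ind, col_ind].sum() / y_pred.size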
Example #17
def train_k_means(n_clusters, init_type, x_array, y, eps, n_init):
    DIGIT_COUNT = 10
    inertias = []
    iterations = []
    entropys = []
    for i in range(n_init):
        # fill matrix by zero
        n_matrix = np.zeros((n_clusters, DIGIT_COUNT), dtype=np.int)
        if init_type == "random":
            init = "random"
        elif init_type == "k-away":
            init = get_k_away_centers(x_array, n_clusters)
        else:
            raise NotImplementedError

        clf = KMeans(init=init,
                     n_clusters=n_clusters,
                     n_init=1,
                     n_jobs=-1,
                     tol=eps)
        clf.fit(x_array)
        # Q value
        inertias.append(clf.inertia_)
        # iterations number
        iterations.append(clf.n_iter_)
        # labels
        for j in range(len(y)):
            digit = y[j]
            cluster = clf.labels_[j]
            n_matrix[cluster][digit] += 1
        n = float(len(y))

        # print "n_matrix = ", [v for v in n_matrix]
        Hyz = -reduce(lambda s, p: s + (p * math.log(p, 2) if p > 0 else 0), [
            n_matrix[cluster][digit] / n for cluster in range(n_clusters)
            for digit in range(DIGIT_COUNT)
        ], 0.0)
        Hz = -reduce(
            lambda s, p: s + (p * math.log(p, 2) if p > 0 else 0),
            [sum(n_matrix[cluster], 0.0) / n
             for cluster in range(n_clusters)], 0.0)
        # print("Hyz = %s" % Hyz)
        # print("Hz = %s" % Hz)
        entropys.append(Hyz - Hz)
    return iterations, inertias, entropys
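The two reduce expressions above compute the joint entropy H(Y, Z) of digit and cluster and the cluster entropy H(Z); their difference, appended to entropys, is the conditional entropy H(Y | Z) of the true digit given the assigned cluster. An equivalent numpy computation over the same count matrix, shown here only as a cross-check:

import numpy as np


def conditional_entropy(n_matrix):
    # H(Y | Z) = H(Y, Z) - H(Z), from the cluster-by-digit count matrix
    p_yz = np.asarray(n_matrix, dtype=float) / np.sum(n_matrix)
    p_z = p_yz.sum(axis=1)
    h_yz = -np.sum(p_yz[p_yz > 0] * np.log2(p_yz[p_yz > 0]))
    h_z = -np.sum(p_z[p_z > 0] * np.log2(p_z[p_z > 0]))
    return h_yz - h_z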
Example #18
    def create_train_kmeans(data, number_of_clusters=len(codes)):
        # n_jobs is set to -1 to use all available CPU cores. This makes a big difference on an 8-core CPU
        # especially when the data size gets much bigger. #perfMatters

        k = KMeans(n_clusters=number_of_clusters, n_jobs=-1, random_state=728)
        # Let's do some timings to see how long it takes to train.
        start = time.time()

        # Train it up
        k.fit(data)

        # Stop the timing
        end = time.time()

        # And see how long that took
        print("Training took {} seconds".format(end - start))

        return k
def test_cifar10():

    (X_train, y_train), (X_test, y_test) = cifar10.load_data()
    X_train = X_train.reshape((50000, 32*32*3))
    X_test  = X_test.reshape((10000, 32*32*3))
    y_train = y_train.reshape((50000))
    y_test  = y_test.reshape((10000))


    distortions = []
    X_ = X_test[y_test==4]
    K = range(1,30)
    for k in K:
        kmeanModel = KMeans(n_clusters=k, random_state=84)
        kmeanModel.fit(X_)
        distortions.append(sum(np.min(cdist(X_, kmeanModel.cluster_centers_, 'euclidean'), axis=1)) / X_.shape[0])

    # Plot the elbow
    plt.plot(K, distortions, 'bx-')
    plt.xlabel('k')
    plt.ylabel('Distortion')
    plt.title('The Elbow Method showing the optimal k')
    plt.show()


    #alg = LogisticRegressionCV(Cs=[1], multi_class='ovr', n_jobs=-1, random_state=84)
    #alg.fit(X_train, y_train)
    #y_pred = alg.predict(X_test)
    #score = accuracy_score(y_test, y_pred)
    #print(score)

    pl = PluralizatorClassifier(
              LogisticRegressionCV(Cs=[1], multi_class='ovr', n_jobs=-1, random_state=84),
              'k-means',
              { 0:3, 1:3, 2:3, 3:3, 4:3, 5:3, 6:3, 7:3, 8:3, 9:3 },
              random_state=84,
              n_jobs=-1)
    pl.fit(X_train, y_train)
    y_pred = pl.predict(X_test)
    score = accuracy_score(y_test, y_pred)
    print(score)

    return
Example #20
def run_kmeans(data,label,k=3,fname="../results/kmeans"):
    if len(data) < k:
        return
    vectorizer = TfidfVectorizer(max_df=0.5, max_features=10000,stop_words='english', use_idf=True)
    clean_data = get_clean_data(data)
    X = vectorizer.fit_transform(clean_data)
    km = KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=1)
    km.fit(X)
    print label,np.bincount(km.labels_)
    assert len(km.labels_) == len(data)
    f = open(fname+str(int(label))+".csv",'w')
    f.write("subject\tbody\tcluster_id\n")
    for i in range(len(data)):
        subject,body = data[i]
        subject  = " ".join(str(subject).split())
        body  = " ".join(str(body).split())
        cluster_id = str(km.labels_[i])
        row = data[i]
        f.write(subject+"\t"+body+"\t"+cluster_id+'\n')
    f.close()
Example #21
 def ClusterBalance(self, indexesToPick, stopCount, kmeansFlag=True):
     print "ClusterBalancing..."
     indexesPicked = []
     obs1 = self.observations[indexesToPick]
     obs = normalize(obs1, axis=0)
     if len(indexesToPick) != 0:
         if kmeansFlag:
             if (len(indexesToPick) < self.numClusters):
                 cluster = KMeans(init='k-means++',
                                  n_clusters=len(obs),
                                  n_init=10)
             else:
                 cluster = KMeans(init='k-means++',
                                  n_clusters=self.numClusters,
                                  n_init=10)
         else:
             if (len(indexesToPick) < self.numClusters):
                 cluster = SpectralClustering(n_clusters=len(obs), n_init=10)  # estimator class from sklearn.cluster
             else:
                 cluster = SpectralClustering(n_clusters=self.numClusters, n_init=10)
         cluster.fit(obs)
         labels = cluster.labels_
         whenToStop = max(2, stopCount)
         count = 0
         while count != whenToStop:
             cluster_list = range(self.numClusters)
             index = 0
             for j in labels:
                 if j in cluster_list:
                     indexesPicked.append(indexesToPick[index])
                     cluster_list.remove(j)
                     count += 1
                     if count == whenToStop:
                         break
                     labels[index] = -1
                     if len(cluster_list) == 0:
                         break
                 index += 1
     return indexesPicked
Example #22
File: k_means.py  Project: sreev/lale
class KMeansImpl():
    def __init__(self,
                 n_clusters=8,
                 init='k-means++',
                 n_init=10,
                 max_iter=300,
                 tol=0.0001,
                 precompute_distances='auto',
                 verbose=0,
                 random_state=None,
                 copy_x=True,
                 n_jobs=None,
                 algorithm='auto'):
        self._hyperparams = {
            'n_clusters': n_clusters,
            'init': init,
            'n_init': n_init,
            'max_iter': max_iter,
            'tol': tol,
            'precompute_distances': precompute_distances,
            'verbose': verbose,
            'random_state': random_state,
            'copy_x': copy_x,
            'n_jobs': n_jobs,
            'algorithm': algorithm
        }
        self._wrapped_model = SKLModel(**self._hyperparams)

    def fit(self, X, y=None):
        if (y is not None):
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def transform(self, X):
        return self._wrapped_model.transform(X)

    def predict(self, X):
        return self._wrapped_model.predict(X)
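A short usage sketch of the wrapper, assuming SKLModel is an alias for sklearn.cluster.KMeans as in the other lale wrapper modules (the data is made up):

import numpy as np

X = np.array([[0.0, 0.0], [0.2, 0.1], [5.0, 5.0], [5.1, 4.8]])
impl = KMeansImpl(n_clusters=2, random_state=0)
impl.fit(X)
print(impl.predict(X))    # cluster index for every row of X
print(impl.transform(X))  # distance from every row to each cluster center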
Example #23
def run_kmeans(data, label, k=3, fname="../results/kmeans"):
    if len(data) < k:
        return
    vectorizer = TfidfVectorizer(max_df=0.5,
                                 max_features=10000,
                                 stop_words='english',
                                 use_idf=True)
    clean_data = get_clean_data(data)
    X = vectorizer.fit_transform(clean_data)
    km = KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=1)
    km.fit(X)
    print label, np.bincount(km.labels_)
    assert len(km.labels_) == len(data)
    f = open(fname + str(int(label)) + ".csv", 'w')
    f.write("subject\tbody\tcluster_id\n")
    for i in range(len(data)):
        subject, body = data[i]
        subject = " ".join(str(subject).split())
        body = " ".join(str(body).split())
        cluster_id = str(km.labels_[i])
        row = data[i]
        f.write(subject + "\t" + body + "\t" + cluster_id + '\n')
    f.close()
    def get_data_for_kl_loss(self, encode_output, label_list, n_clusters):
        """
        returns centroids for KL-divergence loss
        :param encode_output: encoder output
        :param label_list: labels for the encoder output
        :param n_clusters: number of clusters
        :return: centroids
        """

        # if self.use_cuda is False:
        #     data = np.copy(encode_output.data)
        #     label = np.copy(label_list.data)
        # else:
        #     data = np.copy(encode_output.data.cpu())
        #     label = np.copy(label_list.data.cpu())

        data = encode_output
        data_len = len(data)

        if data_len < n_clusters:
            n_clusters = data_len

        kmeans = KMeans(init='k-means++',
                        n_clusters=n_clusters,
                        n_init=self.k_init)

        # Fitting the input data
        kmeans.fit(data)

        # Centroid values
        centroids = kmeans.cluster_centers_

        if self.use_cuda:
            return Variable(torch.from_numpy(centroids).float().cuda())

        return Variable(torch.from_numpy(centroids).float())
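A minimal call sketch, assuming encode_output starts out as a torch tensor: KMeans.fit needs a CPU array, so the tensor is detached first, much as the commented-out lines above suggest (model, encode_output, and label_list are illustrative names):

encoded = encode_output.detach().cpu().numpy()
centroids = model.get_data_for_kl_loss(encoded, label_list, n_clusters=10)
# centroids is a torch Variable holding the k-means cluster centers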
Example #25
 def train(self, X):
     clf = KMeans(self.n_cluters)
     s = clf.fit(X)
     return clf, s
                                                 y_train,
                                                 cv=kfold,
                                                 scoring="accuracy")
    results.append(cv_results)
    names.append(name)
    print("%s: %f (%f)" % (name, cv_results.mean(), cv_results.std()))

print('***********KNN**************')
knn = KNeighborsClassifier()
knn.fit(x_train, y_train)
predictions = knn.predict(x_test)

print(accuracy_score(y_test, predictions))
print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))

print('***********PCA**************')

pca = PCA()
X_train1 = pca.fit_transform(x_train)
X_validation1 = pca.transform(x_test)
#print(pca.explained_variance_ratio_)  # shows the variance ratio of each component
print('************************')
print("\n\n", X_train1)
print('*******KMEANS*************')

kmeans = KMeans(n_clusters=5)
kmeans.fit(X, Y)
#print(kmeans.cluster_centers_)
print(pd.crosstab(Y, kmeans.labels_))
Example #27
        else:
            tweet_topic.append(lda_model(text))
            users_cluster.append(users)
            users = []
            count = count + 1
            #for new users
            users.append(cluster[2])
            text = cluster[1]

    tweet_topic.append(lda_model(text))
    return cluster_data, tweet_topic, users_cluster


print('getting dataframe')
data = get_dataframe()
user_handle = list(data['user_handle'])
print('getting final matrix')
final_matrix, count, tweet_data = get_vector(data)
joblib.dump(final_matrix, home + '/../../data/final_matrix.txt')
print('Applying clustering')
sse = dict()
for k in range(1, 25):
    kmeans = KMeans(n_clusters=k, max_iter=100, random_state=0)
    clus = kmeans.fit(final_matrix)
    sse[k] = clus.inertia_
print('20 clusters are ready')
cluster_data, tweet_topic, users_cluster = get_cluster_kmeans(count)
joblib.dump(tweet_topic, home + '/../../data/tweet_topic.txt')
joblib.dump(users_cluster, home + '/../../data/users_cluster.txt')
joblib.dump(sse, home + '/../../data/sse.txt')
Example #28
import numpy
import os
from sklearn.cluster import KMeans
import cPickle
import sys

# Performs K-means clustering and save the model to a local file

if __name__ == "__main__":
    if len(sys.argv) != 4:
        print "Usage: {0} sift_file cluster_num output_file".format(sys.argv[0])
        print "sift_file -- path to the sift file"
        print "cluster_num -- number of cluster"
        print "output_file -- path to save the k-means model"
        exit(1)

    sift_file = sys.argv[1]
    output_file = sys.argv[3]
    cluster_num = int(sys.argv[2])
    # Read data
    X = numpy.genfromtxt(sift_file, delimiter=";")
    # Fit model
    estimator = KMeans(n_clusters=cluster_num)
    estimator.fit(X)
    # Dump model
    with open(output_file, "wb") as f:
        cPickle.dump(estimator, f)

    print "K-means trained successfully!"
Example #29
#!/bin/python 

import numpy
import os
from sklearn.cluster import KMeans
import cPickle
import sys
import pandas as pd

# Performs K-means clustering and save the model to a local file

if __name__ == '__main__':
    if len(sys.argv) != 4:
        print "Usage: {0} mfcc_csv_file cluster_num output_file".format(sys.argv[0])
        print "mfcc_csv_file -- path to the mfcc csv file"
        print "cluster_num -- number of cluster"
        print "output_file -- path to save the k-means model"
        exit(1)

    mfcc_csv_file = sys.argv[1]; output_file = sys.argv[3]
    cluster_num = int(sys.argv[2])
    df = pd.read_csv(mfcc_csv_file, delimiter=';', header=None)
    kmeans = KMeans(n_clusters=cluster_num, max_iter=300, verbose=1, n_jobs=-1)
    model = kmeans.fit(df)
    cPickle.dump(model, open(output_file, "wb"))
    print "K-means trained successfully!"
Example #30
from sklearn.datasets import make_blobs

np.random.seed(0)

batch_size = 45
centers = [[1, 1], [-1, -1], [1, -1]]
n_clusters = len(centers)
X, labels_true = make_blobs(n_samples=1000, centers=centers, cluster_std=0.4)

# Draw randoms
indxs = np.arange(1000)
np.random.shuffle(indxs)
centroids = X[indxs[:3]]

k_means = KMeans(n_clusters=3, max_iter=1, init=centroids)
k_means.fit(X)
k_means_labels1 = k_means.labels_
k_means_cluster_centers1 = k_means.cluster_centers_

k_means = KMeans(n_clusters=3, max_iter=2, init=centroids)
k_means.fit(X)
k_means_labels2 = k_means.labels_
k_means_cluster_centers2 = k_means.cluster_centers_

k_means = KMeans(n_clusters=3, max_iter=3, init=centroids)
k_means.fit(X)
k_means_labels3 = k_means.labels_
k_means_cluster_centers3 = k_means.cluster_centers_

k_means = KMeans(n_clusters=3, max_iter=4, init=centroids)
k_means.fit(X)
Example #31
import pandas
from sklearn.cluster import KMeans
import _pickle as cPickle
import sys

# Performs K-means clustering and save the model to a local file

if __name__ == '__main__':
    if len(sys.argv) != 4:
        print("Usage: {0} mfcc_csv_file cluster_num output_file".format(
            sys.argv[0]))
        print("mfcc_csv_file -- path to the mfcc csv file")
        print("cluster_num -- number of cluster")
        print("output_file -- path to save the k-means model")
        exit(1)

    mfcc_csv_file = sys.argv[1]
    output_file = sys.argv[3]
    cluster_num = int(sys.argv[2])

    #data = numpy.genfromtxt(mfcc_csv_file, delimiter=";")
    #data = numpy.loadtxt(mfcc_csv_file, delimiter=";")
    #data = pandas.io.parsers.read_csv(mfcc_csv_file, sep=";")
    data = pandas.read_csv(mfcc_csv_file, sep=';')
    #data = numpy.genfromtxt(mfcc_csv_file,dtype=numpy.float64, delimiter=";")

    model = KMeans(n_clusters=cluster_num, n_jobs=5)
    model.fit(data)
    cPickle.dump(model, open(output_file, 'wb'))

    print("K-means trained successfully!")
Example #32
if __name__ == '__main__':
    if len(sys.argv) != 4:
        print("Usage: {0} mfcc_csv_file cluster_num output_file".format(
            sys.argv[0]))
        print("mfcc_csv_file -- path to the mfcc csv file")
        print("cluster_num -- number of cluster")
        print("output_file -- path to save the k-means model")
        exit(1)

    # read cmd lime args
    mfcc_csv_file = sys.argv[1]
    cluster_num = int(sys.argv[2])
    output_file = sys.argv[3]

    # load mfcc features
    mfcc_features = numpy.genfromtxt(mfcc_csv_file,
                                     dtype=numpy.float32,
                                     delimiter=";")

    # create and execute k-means clustering
    km_model = KMeans(n_clusters=cluster_num, n_jobs=2)
    km_model.fit(mfcc_features)
    print("K-means trained successfully!")

    # save model
    out_fd = open(output_file, "wb")
    cPickle.dump(km_model, out_fd)  # cPickle.HIGHEST_PROTOCOL needed?
    out_fd.close()
    print("K-means saved successfully!")
Example #33
# Performs K-means clustering and save the model to a local file

if __name__ == '__main__':
    if len(sys.argv) != 4:
        print "Usage: {0} mfcc_csv_file cluster_num output_file".format(
            sys.argv[0])
        print "mfcc_csv_file -- path to the mfcc csv file"
        print "cluster_num -- number of cluster"
        print "output_file -- path to save the k-means model"
        exit(1)

    mfcc_csv_file = sys.argv[1]
    output_file = sys.argv[3]
    cluster_num = int(sys.argv[2])

    # kmeans code here =====
    #mfcc_array = np.loadtxt(mfcc_csv_file, delimiter = ';', dtype = 'float64')
    mfcc_array = pd.read_csv(mfcc_csv_file,
                             sep=';',
                             header=None,
                             dtype='float64')

    kmeans = KMeans(n_clusters=cluster_num)
    kmeans.fit(mfcc_array)

    #    with open(output_file,'wb') as fp:
    #        cPickle.dump(kmeans,fp)
    cPickle.dump(kmeans, open(output_file, "wb"))

    print "K-means trained successfully!"
class KMeansEstimator:
    """
    This class reads the tweets of users from a file and builds cluster centers on that data. It also provides
    method for finding the closest cluster center of unseen data.
    """
    
    ADJECTIVE = 'JJ'
    
    """
    Feature keys used in clustering...
    """
    POLARITY_FEATURE_KEY = 'polarity'
    SUBJECTIVITY_FEATURE_KEY = 'subjectivity'
    TWEET_COUNT_FEATURE_KEY = 'tweetCount'
    """
    Features not considered for clustering...
    """
    USER_ID_FEATURE_KEY = 'userId'
    LONGITUDE_FEATURE_KEY = 'longitude'
    LATITUDE_FEATURE_KEY = 'latitude'
    
    
    """
    Predicted label feature name.
    """
    LABEL_FEATURE_KEY = 'label'

    RELEVENT_FEATURE_LIST = [USER_ID_FEATURE_KEY, LATITUDE_FEATURE_KEY, LONGITUDE_FEATURE_KEY, LABEL_FEATURE_KEY]
    
    def __init__(self, tweet_file_path, no_of_clusters):
        """
        The constructor reads csv file and builds the data matrix.
        """
        self.np_extractor = ConllExtractor()
        self.pos_tagger = NLTKTagger()
        self.tweet_file_path = tweet_file_path
        self.data_matrix = self.__get_data_matrix_from_file(tweet_file_path)
        self.vectorizer = DictVectorizer(sparse=True)
        self.k_means_estimator = KMeans(init="random", n_clusters=no_of_clusters)
        
    @time_it
    def __get_data_matrix_from_file(self, tweet_file_path):
        """
        Reads tweets from csv file at path "tweet_file_path", extracts features from the tweets and returns list
        of all feature vectors.
        """
        file_reader = csv.reader(open(tweet_file_path, "rb"), delimiter=',')
        next(file_reader)
        data_matrix = []
        for row in file_reader:
            logging.info("Extracting features for user_id:%s", row[0])
            feature_vector = {}
            feature_vector[self.USER_ID_FEATURE_KEY] = int(row[0])
            feature_vector[self.LATITUDE_FEATURE_KEY] = float(row[2])
            feature_vector[self.LONGITUDE_FEATURE_KEY] = float(row[3])
            feature_vector[self.TWEET_COUNT_FEATURE_KEY] = int(row[4])
            feature_vector.update(self.__get_features_from_tweet_text(row[1].decode('utf-8')))
            data_matrix.append(feature_vector)
            logging.info("Successfully extracted features for user_id:%s", row[0])
        return data_matrix
    
    @time_it
    def __get_features_from_tweet_text(self, tweet_text):
        """This function returns the following features from the tweet text:
        - Adjectives and their corresponding frequencies found in the tweet. Each adjective is a separate feature.
        - Subjectivity and polarity as determined by TextBlob.
        :returns:  (key,value) map of all features found. 
        """
        text_blob = TextBlob(tweet_text, np_extractor=self.np_extractor, pos_tagger=self.pos_tagger);
        adjective_map = dict(Counter((ele[0] for ele in set(text_blob.pos_tags) if ele[1] == self.ADJECTIVE)))
        polarity = text_blob.sentiment[0]
        subjectivity = text_blob.sentiment[1]
        return dict(adjective_map.items() + {self.POLARITY_FEATURE_KEY:polarity, self.SUBJECTIVITY_FEATURE_KEY:subjectivity}.items())
    
    @time_it
    def __get_clustering_data_matrix(self, data_matrix):
        """
        This method removes unnecessary features(features like user_id which are not relevant for building cluster centers) from
        the data matrix and returns a copy of the data matrix.
        """
        data_matrix_copy = copy.deepcopy(data_matrix)
        for feature_vector in data_matrix_copy:
            feature_vector.pop(self.USER_ID_FEATURE_KEY)
            feature_vector.pop(self.LATITUDE_FEATURE_KEY)
            feature_vector.pop(self.LONGITUDE_FEATURE_KEY)
        return data_matrix_copy


    @time_it
    def perform_clustering(self, features_to_include=None):
        """
        This function performs k-means clustering with "no_of_clusters" clusters of the data present in file at
        "tweet_file_path".
        It returns list of feature vector, where each feature vector contains only "features_to_include" or all features
        if "features_to_include" is None.
        """
        clustering_data_matrix = self.__get_clustering_data_matrix(self.data_matrix)
        transformed_data_matrix = self.vectorizer.fit_transform(clustering_data_matrix)
        
        self.k_means_estimator.fit(transformed_data_matrix, y=None)
        return self.__get_predicted_labels(self.data_matrix, features_to_include)

    @time_it    
    def __get_predicted_labels(self, data_matrix, features_to_include):
        """
        Finds the nearest cluster for all data points and adds a new feature label in all feature vectors of data matrix. The
        data matrix is modified in place.
        It returns a new copy of data_matrix with "features_to_include" features.
        """
        feature_names = self.vectorizer.get_feature_names()
        for feature_vector in data_matrix:
            row = [0] * len(feature_names)
            column = range(len(feature_names))
            data = map(lambda feature_name:feature_vector[feature_name] if feature_name in feature_vector else 0, feature_names)
            feature_csr_matrix = csr_matrix(coo_matrix((data, (row, column))))
            predicted_label = self.k_means_estimator.predict(feature_csr_matrix)
            feature_vector[self.LABEL_FEATURE_KEY] = predicted_label[0]
        
        expanded_data_matrix = self.__get_expanded_data_matrix(data_matrix)
        if features_to_include:
            return self.__get_filtered_data_matrix(expanded_data_matrix, features_to_include)
        else:
            return expanded_data_matrix

    @time_it
    def __get_filtered_data_matrix(self, data_matrix, features_to_include):
        """
        Removes all features except features_to_include
        """
        filtered_data_matrix = []
        for feature_vector in data_matrix:
            filtered_feature_vector = {}
            for feature_name in features_to_include:
                filtered_feature_vector[feature_name] = feature_vector[feature_name]
            filtered_data_matrix.append(filtered_feature_vector)
        return filtered_data_matrix
    
    @time_it
    def __get_expanded_data_matrix(self, data_matrix):
        """
        Adds new keys for missing features to all feature vectors of data_matrix. The data matrix is not modified, but a new 
        modified copy is returned.
        """
        feature_names = self.vectorizer.get_feature_names()
        expanded_data_matrix = copy.deepcopy(data_matrix)
        for feature_vector in expanded_data_matrix:
            for feature_name in feature_names:
                if feature_name not in feature_vector:
                    feature_vector[feature_name] = 0
        return expanded_data_matrix
    
    @time_it
    def predict_labels_for_data(self, file_path, features_to_include=None):
        """
        This function reads the tweets of different users from the file at file_path and assigns the closest 
        cluster center to each user.
        It returns list of tuples of (user_id,predicted_label,latitude, longitude).
        """
        data_matrix = self.__get_data_matrix_from_file(file_path)
        return self.__get_predicted_labels(data_matrix, features_to_include)
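A short usage sketch of the class above (the file name and cluster count are illustrative):

estimator = KMeansEstimator("tweets.csv", no_of_clusters=5)
labeled_rows = estimator.perform_clustering(
    features_to_include=KMeansEstimator.RELEVENT_FEATURE_LIST)
for feature_vector in labeled_rows[:3]:
    print(feature_vector[KMeansEstimator.LABEL_FEATURE_KEY])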
Example #35
    v0,
    v1,
    arrowprops=arrowprops,
)
plt.text(v1[0], v1[1], "PC2")

# It is important to show the axes with equal scale !!!
# Otherwise you cannot tell from the figure that the PCs are orthogonal !!!
plt.axis("equal")

plt.figure()
plt.scatter(X_new[:, 0], X_new[:, 1], alpha=0.2)
# because PC1 is on the first axis
plt.scatter(X_new[:, 0], np.zeros(X_new[:, 0].shape), alpha=0.1, color='r')

# Let's also draw the projection lines
for i in range(len(X_new[:, 0])):
    plt.plot([X_new[i, 0], X_new[i, 0]], [X_new[i, 1], 0],
             color="deeppink",
             alpha=0.1)

plt.xlabel("PC1")
plt.ylabel("PC2")

plt.axis("equal")
from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=3)
kmeans.fit(x, y)
print(kmeans.cluster_centers_)
print(pd.crosstab(y, kmeans.labels_))
import numpy as np
import sys
import pickle
from sklearn.cluster import KMeans
# Performs K-means clustering and save the model to a local file

if __name__ == '__main__':
    if len(sys.argv) != 4:
        print "Usage: {0} mfcc_csv_file cluster_num output_file".format(
            sys.argv[0])
        print "mfcc_csv_file -- path to the mfcc csv file"
        print "cluster_num -- number of cluster"
        print "output_file -- path to save the k-means model"
        exit(1)

    mfcc_csv_file = sys.argv[1]
    output_file = sys.argv[3]
    cluster_num = int(sys.argv[2])

    mfcc_vectors = np.genfromtxt(mfcc_csv_file, delimiter=";")
    kmeans_model = KMeans(n_clusters=cluster_num,
                          init='k-means++',
                          n_init=10,
                          verbose=1)
    kmeans_model.fit(mfcc_vectors)
    pickle.dump(kmeans_model, open(output_file + '.pickle', 'wb'))

    print "K-means trained successfully!"
Example #38
class KMeansEstimator:
    """
    This class reads the tweets of users from a file and builds cluster centers on that data. It also provides
    method for finding the closest cluster center of unseen data.
    """

    ADJECTIVE = 'JJ'
    """
    Feature keys used in clustering...
    """
    POLARITY_FEATURE_KEY = 'polarity'
    SUBJECTIVITY_FEATURE_KEY = 'subjectivity'
    TWEET_COUNT_FEATURE_KEY = 'tweetCount'
    """
    Features not considered for clustering...
    """
    USER_ID_FEATURE_KEY = 'userId'
    LONGITUDE_FEATURE_KEY = 'longitude'
    LATITUDE_FEATURE_KEY = 'latitude'
    """
    Predicted label feature name.
    """
    LABEL_FEATURE_KEY = 'label'

    RELEVENT_FEATURE_LIST = [
        USER_ID_FEATURE_KEY, LATITUDE_FEATURE_KEY, LONGITUDE_FEATURE_KEY,
        LABEL_FEATURE_KEY
    ]

    def __init__(self, tweet_file_path, no_of_clusters):
        """
        The constructor reads csv file and builds the data matrix.
        """
        self.np_extractor = ConllExtractor()
        self.pos_tagger = NLTKTagger()
        self.tweet_file_path = tweet_file_path
        self.data_matrix = self.__get_data_matrix_from_file(tweet_file_path)
        self.vectorizer = DictVectorizer(sparse=True)
        self.k_means_estimator = KMeans(init="random",
                                        n_clusters=no_of_clusters)

    @time_it
    def __get_data_matrix_from_file(self, tweet_file_path):
        """
        Reads tweets from csv file at path "tweet_file_path", extracts features from the tweets and returns list
        of all feature vectors.
        """
        file_reader = csv.reader(open(tweet_file_path, "rb"), delimiter=',')
        next(file_reader)
        data_matrix = []
        for row in file_reader:
            logging.info("Extracting features for user_id:%s", row[0])
            feature_vector = {}
            feature_vector[self.USER_ID_FEATURE_KEY] = int(row[0])
            feature_vector[self.LATITUDE_FEATURE_KEY] = float(row[2])
            feature_vector[self.LONGITUDE_FEATURE_KEY] = float(row[3])
            feature_vector[self.TWEET_COUNT_FEATURE_KEY] = int(row[4])
            feature_vector.update(
                self.__get_features_from_tweet_text(row[1].decode('utf-8')))
            data_matrix.append(feature_vector)
            logging.info("Successfully extracted features for user_id:%s",
                         row[0])
        return data_matrix

    @time_it
    def __get_features_from_tweet_text(self, tweet_text):
        """This function returns the following features from the tweet text:
        - Adjectives and their corresponding frequencies found in the tweet. Each adjective is a separate feature.
        - Subjectivity and polarity as determined by TextBlob.
        :returns:  (key,value) map of all features found. 
        """
        text_blob = TextBlob(tweet_text,
                             np_extractor=self.np_extractor,
                             pos_tagger=self.pos_tagger)
        adjective_map = dict(
            Counter((ele[0] for ele in set(text_blob.pos_tags)
                     if ele[1] == self.ADJECTIVE)))
        polarity = text_blob.sentiment[0]
        subjectivity = text_blob.sentiment[1]
        return dict(
            adjective_map.items() + {
                self.POLARITY_FEATURE_KEY: polarity,
                self.SUBJECTIVITY_FEATURE_KEY: subjectivity
            }.items())

    @time_it
    def __get_clustering_data_matrix(self, data_matrix):
        """
        This method removes unnecessary features(features like user_id which are not relevant for building cluster centers) from
        the data matrix and returns a copy of the data matrix.
        """
        data_matrix_copy = copy.deepcopy(data_matrix)
        for feature_vector in data_matrix_copy:
            feature_vector.pop(self.USER_ID_FEATURE_KEY)
            feature_vector.pop(self.LATITUDE_FEATURE_KEY)
            feature_vector.pop(self.LONGITUDE_FEATURE_KEY)
        return data_matrix_copy

    @time_it
    def perform_clustering(self, features_to_include=None):
        """
        This function performs k-means clustering with "no_of_clusters" clusters of the data present in file at
        "tweet_file_path".
        It returns list of feature vector, where each feature vector contains only "features_to_include" or all features
        if "features_to_include" is None.
        """
        clustering_data_matrix = self.__get_clustering_data_matrix(
            self.data_matrix)
        transformed_data_matrix = self.vectorizer.fit_transform(
            clustering_data_matrix)

        self.k_means_estimator.fit(transformed_data_matrix, y=None)
        return self.__get_predicted_labels(self.data_matrix,
                                           features_to_include)

    @time_it
    def __get_predicted_labels(self, data_matrix, features_to_include):
        """
        Finds the nearest cluster for all data points and adds a new feature label in all feature vectors of data matrix. The
        data matrix is modified in place.
        It returns a new copy of data_matrix with "features_to_include" features.
        """
        feature_names = self.vectorizer.get_feature_names()
        for feature_vector in data_matrix:
            row = [0] * len(feature_names)
            column = range(len(feature_names))
            data = map(
                lambda feature_name: feature_vector[feature_name]
                if feature_name in feature_vector else 0, feature_names)
            feature_csr_matrix = csr_matrix(coo_matrix((data, (row, column))))
            predicted_label = self.k_means_estimator.predict(
                feature_csr_matrix)
            feature_vector[self.LABEL_FEATURE_KEY] = predicted_label[0]

        expanded_data_matrix = self.__get_expanded_data_matrix(data_matrix)
        if features_to_include:
            return self.__get_filtered_data_matrix(expanded_data_matrix,
                                                   features_to_include)
        else:
            return expanded_data_matrix

    @time_it
    def __get_filtered_data_matrix(self, data_matrix, features_to_include):
        """
        Removes all features except features_to_include
        """
        filtered_data_matrix = []
        for feature_vector in data_matrix:
            filtered_feature_vector = {}
            for feature_name in features_to_include:
                filtered_feature_vector[feature_name] = feature_vector[
                    feature_name]
            filtered_data_matrix.append(filtered_feature_vector)
        return filtered_data_matrix

    @time_it
    def __get_expanded_data_matrix(self, data_matrix):
        """
        Adds new keys for missing features to all feature vectors of data_matrix. The data matrix is not modified, but a new 
        modified copy is returned.
        """
        feature_names = self.vectorizer.get_feature_names()
        expanded_data_matrix = copy.deepcopy(data_matrix)
        for feature_vector in expanded_data_matrix:
            for feature_name in feature_names:
                if feature_name not in feature_vector:
                    feature_vector[feature_name] = 0
        return expanded_data_matrix

    @time_it
    def predict_labels_for_data(self, file_path, features_to_include=None):
        """
        This function reads the tweets of different users from the file at file_path and assigns the closest 
        cluster center to each user.
        It returns list of tuples of (user_id,predicted_label,latitude, longitude).
        """
        data_matrix = self.__get_data_matrix_from_file(file_path)
        return self.__get_predicted_labels(data_matrix, features_to_include)