Example #1
 def ClusterBalance(self, indexesToPick, stopCount, kmeansFlag=True):
     print "ClusterBalancing..."
     indexesPicked = []
     obs1 = self.observations[indexesToPick]
     obs = normalize(obs1, axis=0)
     if len(indexesToPick) != 0:
         if kmeansFlag:
             if(len(indexesToPick) < self.numClusters):
                 cluster = KMeans(init='k-means++', n_clusters=len(obs), n_init=10)
             else:
                 cluster = KMeans(init='k-means++', n_clusters=self.numClusters, n_init=10)
         else:
             if(len(indexesToPick) < self.numClusters):
                 cluster = SpectralClustering(n_clusters=len(obs), n_init=10)
             else:
                 cluster = SpectralClustering(n_clusters=self.numClusters, n_init=10)
         cluster.fit(obs)
         labels = cluster.labels_
         whenToStop = max(2, stopCount)
         count = 0
         while count != whenToStop:
             cluster_list = range(self.numClusters)
             index = 0
             for j in labels:
                 if j in cluster_list:
                     indexesPicked.append(indexesToPick[index])
                     cluster_list.remove(j)
                     count += 1
                     if count == whenToStop:
                         break
                     labels[index] = -1
                     if len(cluster_list) == 0:
                         break
                 index += 1
     return indexesPicked
Example #2
def main(argv=None):

    if argv is None:
        argv = sys.argv[1:]

    args = parser.parse_args(argv)
    log.info('start parameters: ' + str(args))

    log.info('loading data')
    data = np.loadtxt(args.data_points)

    if args.root is not None:
        data = np.sqrt(data)

    (k, initial_points) = get_initial_centers(args.clusters, args.start_points)

    log.info('calculate center points')
    kmeans = KMeans(k, initial_points, 1, args.max_iter, copy_x=False)
    predict = kmeans.fit_predict(data)

    log.info('storing results')

    if args.model:
        save_object_to_file(kmeans, args.model)

    with utf8_file_open(args.outfile, 'w') as outfile:

        for i in range(predict.shape[0]):
            outfile.write('%d\n' % predict[i])

    if args.centroids:
        np.savetxt(args.centroids, kmeans.cluster_centers_)

    log.info('finished')
Example #3
def performKmeans(data,n_clusters):
    
    print "Performing K-Means on data"
    est = KMeans(n_clusters)
    est.fit(data)
    orb_cb_handler.store_estimator(est)
    
    return est
Example #4
 def start_algorithm(self):
     """
     start clustering the stored tweets
     :return: list of clusters containing tweets
     """
     vectors = self.vectorize_data()
     kmeans = KMeans(init='k-means++', n_clusters=self.cluster_amount, n_init=10)
     kmeans.fit(vectors)
     return self.cluster_tweet(kmeans.labels_)
Example #5
def performKmeans(data,n_clusters):
    
    print "Performing K-Means on data"
    est = KMeans(n_clusters)
    est.fit(data)
    labels = est.labels_
    labels_np = np.array(labels)
    
    return labels,est
Example #6
def evaluateKMeans(data, labels, nclusters, method_name):
    '''
    Clusters the data with the k-means algorithm, then returns a formatted string containing the method name and metrics, along with the fitted cluster centers
    :param data: Points that need to be clustered as a numpy array
    :param labels: True labels for the given points
    :param nclusters: Total number of clusters
    :param method_name: Name of the method from which the clustering space originates (only used for printing)
    :return: Formatted string containing metrics and method name, cluster centers
    '''
    kmeans = KMeans(n_clusters=nclusters, n_init=20)
    kmeans.fit(data)
    return getClusterMetricString(method_name, labels, kmeans.labels_), kmeans.cluster_centers_
Example #7
    def get_params(self, deep=True):
        """Overrides superclass get_params() to allow superclass hyperparameters
          to be returned as well.

          This allows for tuning any parameters from the parent KMeans class
          without having to list each parameter in the KMeansMM __init__() function.
          Taken from https://stackoverflow.com/questions/51430484/how-to-subclass-a-vectorizer-in-scikit-learn-without-repeating-all-parameters-in.

        Parameters
        ----------
        deep : boolean, optional
            If True, will return the parameters for this estimator and
            contained subobjects that are estimators.

        Returns
        -------
        params : mapping of string to any
            Parameter names mapped to their values.

        """

        params = super().get_params(deep)
        cp = copy.copy(self)
        cp.__class__ = KMeans
        params.update(KMeans.get_params(cp, deep))
        return params
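The snippet above shows only the get_params() override. Below is a minimal, self-contained sketch of how such a subclass might be put together; the KMeansMM class body and its outlier_threshold parameter are assumptions for illustration, not code from the original project.

import copy

from sklearn.cluster import KMeans


class KMeansMM(KMeans):
    # Hypothetical subclass: adds one extra hyperparameter on top of KMeans.
    def __init__(self, outlier_threshold=0.5, **kwargs):
        super().__init__(**kwargs)
        self.outlier_threshold = outlier_threshold

    def get_params(self, deep=True):
        # Same trick as above: collect this class's own parameters, then view
        # the instance as a plain KMeans to pick up the inherited ones.
        params = super().get_params(deep)
        cp = copy.copy(self)
        cp.__class__ = KMeans
        params.update(KMeans.get_params(cp, deep))
        return params


# Both the new parameter and the inherited KMeans parameters are visible,
# so tools such as GridSearchCV can tune them together.
print(sorted(KMeansMM(n_clusters=3).get_params()))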
Example #8
def initial_kmeans(k, rand_state, data, reallabels):
    min_clusters, max_clusters = k_range(k)  # derive the range of cluster counts to test from the number of true class labels
    bestAri_arr = []  # best ARI achieved for each cluster count k
    # bestCr_arr = []  # best CR achieved for each cluster count k
    kmeans_labels = []  # best partition found for the current cluster count k
    kmeans_labels_arr = []  # best partitions, one per cluster count k
    for clusters in range(min_clusters, max_clusters):
        bestAri = 0  # best ARI seen for this cluster count k
        # bestCr = -1
        for i in range(ini_generation):
            y_kmeans = KMeans(n_clusters=clusters,
                              random_state=rand_state).fit_predict(data)
            kmeans_ari = adjusted_rand_score(reallabels, y_kmeans)
            # kmeans_cr = corrected_rand(reallabels, y_kmeans)
            if kmeans_ari > bestAri:
                bestAri = kmeans_ari
                kmeans_labels = y_kmeans
            # if kmeans_cr > bestCr:
            #     bestCr = kmeans_cr
        # bestCr_arr.append(bestCr)
        bestAri_arr.append(bestAri)
        ind_kmeans = creator.Individual(kmeans_labels)
        kmeans_labels_arr.append(ind_kmeans)
    # print('best CR values of kmeans: %s' % bestCr_arr)
    return kmeans_labels_arr, bestAri_arr
Example #9
File: k_means.py Project: sreev/lale
 def __init__(self,
              n_clusters=8,
              init='k-means++',
              n_init=10,
              max_iter=300,
              tol=0.0001,
              precompute_distances='auto',
              verbose=0,
              random_state=None,
              copy_x=True,
              n_jobs=None,
              algorithm='auto'):
     self._hyperparams = {
         'n_clusters': n_clusters,
         'init': init,
         'n_init': n_init,
         'max_iter': max_iter,
         'tol': tol,
         'precompute_distances': precompute_distances,
         'verbose': verbose,
         'random_state': random_state,
         'copy_x': copy_x,
         'n_jobs': n_jobs,
         'algorithm': algorithm
     }
     self._wrapped_model = SKLModel(**self._hyperparams)
Example #10
 def fit(self, X, y=None):
     self._sklearn_model = SKLModel(**self._hyperparams)
     if (y is not None):
         self._sklearn_model.fit(X, y)
     else:
         self._sklearn_model.fit(X)
     return self
Example #11
def extract_word_clusters(commentList, commentCount):
    brown_ic = wordnet_ic.ic('ic-brown.dat')
    a, corpus, global_synsets = extract_global_bag_of_words(commentList, True)
    similarity_dict = {}
    i = 0
    t = len(global_synsets)**2
    
    for syn_out in global_synsets:
        similarity_dict[syn_out] = {} 
        for syn_in in global_synsets:
            if syn_in.pos() == syn_out.pos():
                similarity_dict[syn_out][syn_in] = syn_out.lin_similarity(syn_in, brown_ic)
            else:
                similarity_dict[syn_out][syn_in] = max(wn.path_similarity(syn_out,syn_in), wn.path_similarity(syn_in,syn_out))
        
            if i % 10000 == 0:
                print i, 'synsets processed out of',len(global_synsets)**2, '(',float(i)/(t),'%)'
            i += 1

    tuples = [(i[0], i[1].values()) for i in similarity_dict.items()] 
    vectors = [np.array(tup[1]) for tup in tuples]

    
    # Rule of thumb
    n = int(sqrt(len(global_synsets)/2))
    print "Number of clusters", n
    km_model = KMeans(n_clusters=n)
    km_model.fit(vectors)
    
    clustering = collections.defaultdict(list)
    for idx, label in enumerate(km_model.labels_):
        clustering[label].append(tuples[idx][0])
        
    pprint.pprint(dict(clustering), width=1)
    
    feature_vector = np.zeros([len(corpus),n])
    
    for i,comment in enumerate(corpus):
        for w in comment:
            for key, clust in clustering.items():
                if w in clust:
                    feature_vector[i][key] += 1
        if i % 1000 == 0:
            print i, 'comments processed'
        
    print feature_vector
Example #12
def evaluate_k_means_raw(data, true_labels, n_clusters, k_init):
    """
    Clusters data with K-Means algorithm and then returns clustering accuracy and NMI
    :param data: Points that need to be clustered as a numpy array
    :param true_labels: True labels for the given points
    :param n_clusters: Total number of clusters
    :return: ACC, NMI
    """
    # https://github.com/Datamine/MNIST-K-Means-Clustering/blob/master/Kmeans.ipynb
    # http://johnloeber.com/docs/kmeans.html
    # Llyod's Algorithm for K-Means Clustering

    kmeans = KMeans(n_clusters=n_clusters, n_init=k_init)
    kmeans.fit(data)
    acc = cluster_acc(true_labels, kmeans.labels_)
    nmi = metrics.normalized_mutual_info_score(true_labels, kmeans.labels_)
    return acc, nmi
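cluster_acc is referenced above but not defined in this example. A common way to compute unsupervised clustering accuracy, and a plausible reading of what the helper does (an assumption, not the original implementation), is to match cluster ids to true labels with the Hungarian algorithm:

import numpy as np
from scipy.optimize import linear_sum_assignment


def cluster_acc(y_true, y_pred):
    """Clustering accuracy under the best cluster-to-label assignment."""
    y_true = np.asarray(y_true, dtype=np.int64)
    y_pred = np.asarray(y_pred, dtype=np.int64)
    n = max(y_pred.max(), y_true.max()) + 1
    # contingency matrix: rows = predicted cluster, columns = true label
    w = np.zeros((n, n), dtype=np.int64)
    for p, t in zip(y_pred, y_true):
        w[p, t] += 1
    # Hungarian algorithm on the negated counts finds the mapping that
    # maximises the number of correctly matched points
    row_ind, col_ind = linear_sum_assignment(w.max() - w)
    return w[row_ind, col_ind].sum() / y_pred.size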
Example #13
def getClustering(method_name="k-mean", param_map={}):
    if method_name == "k-mean":
        from sklearn.cluster.k_means_ import KMeans
        return KMeans(**param_map)
    elif method_name == "dbscan":
        from sklearn.cluster import DBSCAN
        return DBSCAN(**param_map)
    return None
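Possible usage of the factory above (the parameter values are illustrative only). Note that sklearn.cluster.k_means_ is a private module path from older scikit-learn releases; in current versions the public import is from sklearn.cluster import KMeans.

km = getClustering("k-mean", {"n_clusters": 5, "n_init": 10})
db = getClustering("dbscan", {"eps": 0.3, "min_samples": 5})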
Example #14
def getClusters(input_data):
    km = KMeans(n_clusters=10, random_state=0).fit(input_data)
    centers = km.cluster_centers_
    centers = np.insert(
        centers, 0, 1, 1
    )  ####=========================== ADD BIAS =====================================##
    print("Centers : ", centers.shape)
    return centers
Example #15
def runKMeans(distance_matrix, nClusters, number_of_threads):

    km = KMeans(n_clusters=nClusters,
                max_iter=100,
                init='k-means++',
                precompute_distances=True,
                n_jobs=number_of_threads)
    km.fit(distance_matrix)

    labels = km.labels_
    n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
    n_noises = list(labels).count(-1)
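    # Note: KMeans never assigns the -1 "noise" label (that is a DBSCAN
    # convention), so n_noises will always be 0 here.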

    print('Number of clusters' + str(n_clusters))
    print('Number of noises' + str(n_noises))

    return list(labels)
Example #16
def train_k_means(n_clusters, init_type, x_array, y, eps, n_init):
    DIGIT_COUNT = 10
    inertias = []
    iterations = []
    entropys = []
    for i in range(n_init):
        # fill matrix by zero
        n_matrix = np.zeros((n_clusters, DIGIT_COUNT), dtype=np.int)
        if init_type == "random":
            init = "random"
        elif init_type == "k-away":
            init = get_k_away_centers(x_array, n_clusters)
        else:
            raise NotImplementedError

        clf = KMeans(init=init,
                     n_clusters=n_clusters,
                     n_init=1,
                     n_jobs=-1,
                     tol=eps)
        clf.fit(x_array)
        # Q value
        inertias.append(clf.inertia_)
        # iterations number
        iterations.append(clf.n_iter_)
        # labels
        for j in range(len(y)):
            digit = y[j]
            cluster = clf.labels_[j]
            n_matrix[cluster][digit] += 1
        n = float(len(y))

        # print "n_matrix = ", [v for v in n_matrix]
        Hyz = -reduce(lambda s, p: s + (p * math.log(p, 2) if p > 0 else 0), [
            n_matrix[cluster][digit] / n for cluster in range(n_clusters)
            for digit in range(DIGIT_COUNT)
        ], 0.0)
        Hz = -reduce(
            lambda s, p: s + (p * math.log(p, 2) if p > 0 else 0),
            [sum(n_matrix[cluster], 0.0) / n
             for cluster in range(n_clusters)], 0.0)
        # print("Hyz = %s" % Hyz)
        # print("Hz = %s" % Hz)
        entropys.append(Hyz - Hz)
    return iterations, inertias, entropys
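The reduce() expressions above appear to compute the conditional entropy H(digit | cluster) = H(digit, cluster) - H(cluster) from the contingency matrix n_matrix. A numpy equivalent, as a sketch rather than part of the original code:

import numpy as np


def conditional_entropy(n_matrix):
    """H(digit | cluster) computed from a cluster-by-digit count matrix."""
    p = n_matrix / n_matrix.sum()        # joint probabilities p(cluster, digit)
    p_cluster = p.sum(axis=1)            # marginal p(cluster)
    h_joint = -np.sum(p[p > 0] * np.log2(p[p > 0]))
    h_cluster = -np.sum(p_cluster[p_cluster > 0] * np.log2(p_cluster[p_cluster > 0]))
    return h_joint - h_cluster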
Example #17
    def trainmodel5(self, filename):
        df = pd.read_csv(filename)
        df.loc[:, 'animal'].replace(['sheep', 'cow'], [1, 2], inplace=True)
        df = df.drop(df[df['scale'] == 0].index)
        x = df.loc[:, 'animal':'age']
        y = df.loc[:, 'weight']
        self.x = x
        self.y = y
        self.data = df
        self.spliter(x, y)

        self.linier.fit(self.x, self.y)
        n_neighbors = 3
        for i, weights in enumerate(['uniform', 'distance']):
            self.knn = neighbors.KNeighborsRegressor(n_neighbors, weights=weights)
            self.knn.fit(self.x, self.y)
        self.kmeans = KMeans(n_clusters=5)
        self.kmeans.fit(self.x)
Example #18
File: Cluster.py Project: RenzeLou/MORE
def K_means_BERT(datasets, pred_vector, labels, opt):
    # datasets: a list ,each element is a [3,max_len] array sample
    # pred_vector: a model's function to predict embedding
    # num_classes: num of class
    num_classes = len(np.unique(labels))
    feature_embeddings = model_pred_BERT(datasets, pred_vector, labels, opt)
    kmeans = KMeans(n_clusters=num_classes, n_init=10).fit(feature_embeddings)
    label_list = kmeans.labels_.tolist()
    return label_list, create_msg(label_list), kmeans.cluster_centers_, feature_embeddings
Example #19
    def create_train_kmeans(data, number_of_clusters=len(codes)):
        # n_jobs is set to -1 to use all available CPU cores. This makes a big difference on an 8-core CPU
        # especially when the data size gets much bigger. #perfMatters

        k = KMeans(n_clusters=number_of_clusters, n_jobs=-1, random_state=728)
        # Let's do some timings to see how long it takes to train.
        start = time.time()

        # Train it up
        k.fit(data)

        # Stop the timing
        end = time.time()

        # And see how long that took
        print("Training took {} seconds".format(end - start))

        return k
Example #20
 def __init__(self, tweet_file_path, no_of_clusters):
     """
     The constructor reads csv file and builds the data matrix.
     """
     self.np_extractor = ConllExtractor()
     self.pos_tagger = NLTKTagger()
     self.tweet_file_path = tweet_file_path
     self.data_matrix = self.__get_data_matrix_from_file(tweet_file_path)
     self.vectorizer = DictVectorizer(sparse=True)
     self.k_means_estimator = KMeans(init="random", n_clusters=no_of_clusters)
Example #21
def test_cifar10():

    (X_train, y_train), (X_test, y_test) = cifar10.load_data()
    X_train = X_train.reshape((50000, 32*32*3))
    X_test  = X_test.reshape((10000, 32*32*3))
    y_train = y_train.reshape((50000))
    y_test  = y_test.reshape((10000))


    distortions = []
    X_ = X_test[y_test==4]
    K = range(1,30)
    for k in K:
        kmeanModel = KMeans(n_clusters=k, random_state=84)
        kmeanModel.fit(X_)
        distortions.append(sum(np.min(cdist(X_, kmeanModel.cluster_centers_, 'euclidean'), axis=1)) / X_.shape[0])

    # Plot the elbow
    plt.plot(K, distortions, 'bx-')
    plt.xlabel('k')
    plt.ylabel('Distortion')
    plt.title('The Elbow Method showing the optimal k')
    plt.show()


    #alg = LogisticRegressionCV(Cs=[1], multi_class='ovr', n_jobs=-1, random_state=84)
    #alg.fit(X_train, y_train)
    #y_pred = alg.predict(X_test)
    #score = accuracy_score(y_test, y_pred)
    #print(score)

    pl = PluralizatorClassifier(
              LogisticRegressionCV(Cs=[1], multi_class='ovr', n_jobs=-1, random_state=84),
              'k-means',
              { 0:3, 1:3, 2:3, 3:3, 4:3, 5:3, 6:3, 7:3, 8:3, 9:3 },
              random_state=84,
              n_jobs=-1)
    pl.fit(X_train, y_train)
    y_pred = pl.predict(X_test)
    score = accuracy_score(y_test, y_pred)
    print(score)

    return
Example #22
def initial_kmeans(k, rand_state, data):
    min_clusters, max_clusters = k_range(k)  # derive the range of cluster counts to test from the number of true class labels
    kmeans_labels_arr = []  # best partitions, one per cluster count k
    for clusters in range(min_clusters, max_clusters):
        kmeans_labels = KMeans(n_clusters=clusters,
                               random_state=rand_state).fit_predict(data)
        ind_kmeans = creator.Individual(kmeans_labels)

        kmeans_labels_arr.append(ind_kmeans)
    return kmeans_labels_arr
Example #23
def main():
    predicted_labelAll = []
    datamat, datalabels = loadDataset("../dataset/iris.data")
    print 'data ready'
    nmi_max = -inf
    ari_max = -inf
    for i in range(10):
        clusters = random.randint(2, 11)
        predicted_label = KMeans(n_clusters=clusters).fit_predict(datamat)
        predicted_label = predicted_label.tolist()
        nmi = normalized_mutual_info_score(datalabels, predicted_label)
        ari = adjusted_rand_score(datalabels, predicted_label)
        if nmi > nmi_max:
            nmi_max = nmi
        if ari > ari_max:
            ari_max = ari
    print('NMI value:')
    print(nmi_max)
    print('ARI value:')
    print(ari_max)
Example #24
def simulation(n, n_clusters, k_range, dim, runs=100):
    all_data = []
    k_low, k_hi = k_range
    for idx in range(runs):
        data, labels = make_blobs(n_samples=n,
                                  n_features=dim,
                                  centers=n_clusters,
                                  cluster_std=0.1,
                                  center_box=(-1.0, 1.0))

        for k in range(k_low, k_hi + 1):
            # Get a model specified, fit to data, score for error, mark error as -1 if fails
            model = KMeans(n_clusters=k, random_state=0)
            labels = model.fit_predict(data)
            avg_score = silhouette_score(data, labels)
            all_data.append([n, n_clusters, k, dim, avg_score])

    df = pd.DataFrame(all_data,
                      columns=['n', 'n_clusters', 'k', 'dim', 'avg_score'])
    return df
Example #25
def run_kmeans(data,label,k=3,fname="../results/kmeans"):
    if len(data) < k:
        return
    vectorizer = TfidfVectorizer(max_df=0.5, max_features=10000,stop_words='english', use_idf=True)
    clean_data = get_clean_data(data)
    X = vectorizer.fit_transform(clean_data)
    km = KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=1)
    km.fit(X)
    print label,np.bincount(km.labels_)
    assert len(km.labels_) == len(data)
    f = open(fname+str(int(label))+".csv",'w')
    f.write("subject\tbody\tcluster_id\n")
    for i in range(len(data)):
        subject,body = data[i]
        subject  = " ".join(str(subject).split())
        body  = " ".join(str(body).split())
        cluster_id = str(km.labels_[i])
        row = data[i]
        f.write(subject+"\t"+body+"\t"+cluster_id+'\n')
    f.close()
Example #26
def bisection(max_k: int, data: np.ndarray) -> tree_node:
    current_k = 1
    data_centroid = np.mean(data, 0)
    root = tree_node(0, data_centroid)
    root_sse = sum_square_error(data_centroid, data)
    next_split_order = 1
    next_node_id = 1
    queue = PriorityQueue()
    queue.put((-1.0 * root_sse, root, data))

    # print(f"rootsse {root.sse}")
    while current_k < max_k:
        _, leaf_to_split, split_data = queue.get()
        # print(f"leaf_to_split sse {leaf_to_split.sse}")
        leaf_to_split.split_order = next_split_order
        next_split_order += 1
        k = KMeans(2)
        labels = np.array(k.fit_predict(split_data), dtype=np.float32)
        labels = labels.reshape([len(labels), 1])

        left_idx = np.asanyarray([i for i in range(split_data.shape[0]) if labels[i] == 0])
        left_data = split_data[left_idx, :]
        left_child = tree_node(next_node_id, np.mean(left_data, 0))
        next_node_id += 1
        leaf_to_split.left_child = left_child
        queue.put((-1.0 * sum_square_error(left_child.centroid, left_data), left_child, left_data))
        # print(f"left_child sse {left_child.sse}")

        right_idx = np.asanyarray([i for i in range(split_data.shape[0]) if labels[i] == 1])
        right_data = split_data[right_idx, :]
        right_child = tree_node(next_node_id, np.mean(right_data, 0))
        next_node_id += 1
        leaf_to_split.right_child = right_child
        queue.put((-1.0 * sum_square_error(right_child.centroid, right_data), right_child, right_data))
        # print(f"right_child sse {right_child.sse}")

        current_k += 1  # it is only one leaf node more

    _assign_leaf_ids(root)

    return root
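Two of the helpers used above, tree_node and sum_square_error, are not shown in this example. Minimal sketches of what they might look like, as assumptions for illustration rather than the project's originals:

import numpy as np


def sum_square_error(centroid, data):
    """Total squared Euclidean distance of the points in data to the centroid."""
    return float(np.sum((data - centroid) ** 2))


class tree_node:
    """Node of the bisecting k-means tree: an id, a centroid and two children."""

    def __init__(self, node_id, centroid):
        self.node_id = node_id
        self.centroid = centroid
        self.split_order = None
        self.left_child = None
        self.right_child = None

    def __lt__(self, other):
        # Tie-breaker so PriorityQueue never has to compare numpy arrays
        # when two subsets happen to have identical SSE values.
        return self.node_id < other.node_id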
Example #27
def train_k_means_by_step(n_clusters, init_cluster_centers, x_array, eps):
    # eps = 1e-4
    # eps = 0.1
    # eps = 100.0
    # prev_sample = np.array(clf.cluster_centers_, np.float)
    prev_centers = init_cluster_centers
    clf = KMeans(init=prev_centers,
                 n_clusters=n_clusters,
                 n_init=1,
                 n_jobs=-1,
                 tol=eps,
                 max_iter=1)
    # if isinstance(prev_centers, str):
    #     prev_centers = clf.cluster_centers_
    clf.fit(x_array)
    new_centers = clf.cluster_centers_

    centers_list = [prev_centers, new_centers]
    args = [1]
    values = [clf.inertia_]
    while get_distance(prev_centers, new_centers) > eps:
        prev_centers = new_centers
        clf = KMeans(init=prev_centers,
                     n_clusters=n_clusters,
                     n_init=1,
                     n_jobs=-1,
                     tol=eps,
                     max_iter=1).fit(x_array)
        new_centers = clf.cluster_centers_
        args.append(len(args) + 1)
        values.append(clf.inertia_)
        centers_list.append(new_centers)
    # print "k = %s, len centers = %s" % (n_clusters, len(f_values))
    return args, values, centers_list
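get_distance is defined elsewhere; a plausible definition (an assumption, not the original) measures how far the centroid set moved between two consecutive single-iteration fits:

import numpy as np


def get_distance(prev_centers, new_centers):
    """Largest Euclidean shift of any single centroid between two iterations."""
    prev_centers = np.asarray(prev_centers, dtype=float)
    new_centers = np.asarray(new_centers, dtype=float)
    return float(np.max(np.linalg.norm(new_centers - prev_centers, axis=1)))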
Example #28
def plot_job_cluster(n_clusters, no_jobs, subset, kmeans=None):
    if not kmeans:
        kmeans = KMeans(n_clusters=n_clusters)
    job_predict = kmeans.fit_predict(subset)
    plot_california_counties()
    for i in range(n_clusters):
        mean_jobs = np.mean(
            [no_jobs[j] for j in range(len(no_jobs)) if job_predict[j] == i])
        plt.scatter(
            [subset[j][0] for j in range(len(subset)) if job_predict[j] == i],
            [subset[j][1] for j in range(len(subset)) if job_predict[j] == i],
            label=f"Mean No. Jobs:{mean_jobs:.0f}",
            s=4.5)

    # city_labels()
    plt.legend()
    plt.gca().set_xlabel("Longitude")
    plt.gca().set_ylabel("Latitude")
    plt.xlim((-120, -116))
    plt.ylim((33, 35))
    plt.axis('equal')
    plt.show()
Example #29
def test_KMeansConstrained_parity_digits():
    iris = datasets.load_iris()
    X = iris.data

    k = 8
    random_state = 1
    size_min, size_max = None, None  # No restrictions and so should produce same result

    clf_constrained = KMeansConstrained(size_min=size_min,
                                        size_max=size_max,
                                        n_clusters=k,
                                        random_state=random_state,
                                        init='k-means++',
                                        n_init=10,
                                        max_iter=300,
                                        tol=1e-4)
    y_constrained = clf_constrained.fit_predict(X)

    # TODO: Testing is pinned to scikit-learn v0.19 because of a discrepancy with scikit-learn v0.22: https://github.com/scikit-learn/scikit-learn/issues/16623
    clf_kmeans = KMeans(n_clusters=k,
                        random_state=random_state,
                        init='k-means++',
                        n_init=10,
                        max_iter=300,
                        tol=1e-4)
    y_kmeans = clf_kmeans.fit_predict(X)

    # Each cluster should have the same number of datapoints assigned to it
    constrained_ndp = pd.Series(y_constrained).value_counts().values
    kmeans_ndp = pd.Series(y_kmeans).value_counts().values

    assert_almost_equal(constrained_ndp, kmeans_ndp)

    # Sort the cluster coordinates (otherwise in a random order)
    constrained_cluster_centers = sort_coordinates(
        clf_constrained.cluster_centers_)
    kmean_cluster_centers = sort_coordinates(clf_kmeans.cluster_centers_)

    assert_almost_equal(constrained_cluster_centers, kmean_cluster_centers)
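sort_coordinates is not shown above; a plausible helper (an assumption, not the library's code) orders the centre rows so the two estimators' centres can be compared row by row:

import numpy as np


def sort_coordinates(cluster_centers):
    """Sort cluster-centre rows lexicographically by their coordinates."""
    cluster_centers = np.asarray(cluster_centers)
    order = np.lexsort(cluster_centers.T[::-1])
    return cluster_centers[order]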
Example #30
def run_kmeans(data, label, k=3, fname="../results/kmeans"):
    if len(data) < k:
        return
    vectorizer = TfidfVectorizer(max_df=0.5,
                                 max_features=10000,
                                 stop_words='english',
                                 use_idf=True)
    clean_data = get_clean_data(data)
    X = vectorizer.fit_transform(clean_data)
    km = KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=1)
    km.fit(X)
    print label, np.bincount(km.labels_)
    assert len(km.labels_) == len(data)
    f = open(fname + str(int(label)) + ".csv", 'w')
    f.write("subject\tbody\tcluster_id\n")
    for i in range(len(data)):
        subject, body = data[i]
        subject = " ".join(str(subject).split())
        body = " ".join(str(body).split())
        cluster_id = str(km.labels_[i])
        row = data[i]
        f.write(subject + "\t" + body + "\t" + cluster_id + '\n')
    f.close()
Example #31
File: Cluster.py Project: RenzeLou/MORE
def K_means(datasets, pred_vector, num_classes, opt):
    '''
    Args:
        datasets: a list ,each element is a [3,max_len] array sample
        pred_vector: a model's function to predict embedding
        num_classes: num of class

    Returns:
        K-means results -- a tuple(label_list, message, cluster_centers, features)
    '''
    feature_embeddings = model_pred(datasets, pred_vector, opt)
    kmeans = KMeans(n_clusters=num_classes, n_init=10).fit(feature_embeddings)
    label_list = kmeans.labels_.tolist()
    return label_list, create_msg(label_list), kmeans.cluster_centers_, feature_embeddings
Example #32
def perLabel(label_name, labels, sample_size, n_clusters):
    print(79 * '_')
    print label_name
    print(
        '% 9s' % 'feature'
        '    time  inertia    homo   compl  v-meas     ARI AMI  silhouette')
    #print "number of distinct classes for true labels for ",label_name, len(Counter(labels))
    estimator = KMeans(n_clusters=n_clusters)
    bench_k_means(labels, sample_size, estimator, "RGB", rgb_data)
    bench_k_means(labels, sample_size, estimator, "LAB", lab_data)
    bench_k_means(labels, sample_size, estimator, "HOG", hog_data)
    bench_k_means(labels, sample_size, estimator, "GIST", gist_data)
    bench_k_means(labels, sample_size, estimator, "SURF", surf_data)
    bench_k_means(labels, sample_size, estimator, "SIFT", sift_data)
    bench_k_means(labels, sample_size, estimator, "ORB", orb_data)
Example #33
    def get_data_for_kl_loss(self, encode_output, label_list, n_clusters):
        """
        returns centroids for KL-divergence loss
        :param encode_output: encoder output
        :param label_list: labels for the encoder output
        :param n_clusters: number of clusters
        :return: centroids
        """

        # if self.use_cuda is False:
        #     data = np.copy(encode_output.data)
        #     label = np.copy(label_list.data)
        # else:
        #     data = np.copy(encode_output.data.cpu())
        #     label = np.copy(label_list.data.cpu())

        data = encode_output
        data_len = len(data)

        if data_len < n_clusters:
            n_clusters = data_len

        kmeans = KMeans(init='k-means++',
                        n_clusters=n_clusters,
                        n_init=self.k_init)

        # Fitting the input data
        kmeans.fit(data)

        # Centroid values
        centroids = kmeans.cluster_centers_

        if self.use_cuda:
            return Variable(torch.from_numpy(centroids).float().cuda())

        return Variable(torch.from_numpy(centroids).float())
Example #34
def compute_bench(samples_range, features_range):

    it = 0
    iterations = 200
    results = defaultdict(lambda: [])
    chunk = 100

    max_it = len(samples_range) * len(features_range)
    for n_samples in samples_range:
        for n_features in features_range:
            it += 1
            print '=============================='
            print 'Iteration %03d of %03d' % (it, max_it)
            print '=============================='
            print ''
            data = nr.random_integers(-50, 50, (n_samples, n_features))

            print 'K-Means'
            tstart = time()
            kmeans = KMeans(init='k-means++', k=10).fit(data)

            delta = time() - tstart
            print "Speed: %0.3fs" % delta
            print "Inertia: %0.5f" % kmeans.inertia_
            print ''

            results['kmeans_speed'].append(delta)
            results['kmeans_quality'].append(kmeans.inertia_)

            print 'Fast K-Means'
            # let's prepare the data in small chunks
            mbkmeans = MiniBatchKMeans(init='k-means++',
                                       k=10,
                                       chunk_size=chunk)
            tstart = time()
            mbkmeans.fit(data)
            delta = time() - tstart
            print "Speed: %0.3fs" % delta
            print "Inertia: %f" % mbkmeans.inertia_
            print ''
            print ''

            results['minibatchkmeans_speed'].append(delta)
            results['minibatchkmeans_quality'].append(mbkmeans.inertia_)

    return results
Example #35
def compute_bench(samples_range, features_range):

    it = 0
    results = defaultdict(lambda: [])
    chunk = 100

    max_it = len(samples_range) * len(features_range)
    for n_samples in samples_range:
        for n_features in features_range:
            it += 1
            print('==============================')
            print('Iteration %03d of %03d' % (it, max_it))
            print('==============================')
            print()
            data = nr.randint(-50, 51, (n_samples, n_features))

            print('K-Means')
            tstart = time()
            kmeans = KMeans(init='k-means++', n_clusters=10).fit(data)

            delta = time() - tstart
            print("Speed: %0.3fs" % delta)
            print("Inertia: %0.5f" % kmeans.inertia_)
            print()

            results['kmeans_speed'].append(delta)
            results['kmeans_quality'].append(kmeans.inertia_)

            print('Fast K-Means')
            # let's prepare the data in small chunks
            mbkmeans = MiniBatchKMeans(init='k-means++',
                                       n_clusters=10,
                                       batch_size=chunk)
            tstart = time()
            mbkmeans.fit(data)
            delta = time() - tstart
            print("Speed: %0.3fs" % delta)
            print("Inertia: %f" % mbkmeans.inertia_)
            print()
            print()

            results['MiniBatchKMeans Speed'].append(delta)
            results['MiniBatchKMeans Quality'].append(mbkmeans.inertia_)

    return results
Example #36
def perform():
    #imagesHandler.load_images()
    #colourHandler.extract_colour_distribution_from_all_images("RGB")
    RGB_data = colourHandler.getColourDistForAllImages("RGB")
    RGB_data = np.array(RGB_data, dtype=None)
    RGB_data = np.delete(RGB_data, 0, 1)

    LAB_data = colourHandler.getColourDistForAllImages("LAB")
    LAB_data = np.array(LAB_data, dtype=None)
    LAB_data = np.delete(LAB_data, 0, 1)

    gistVals = util.loadCSV("gistvals")
    gist_data = np.array(gistVals)

    #hogHandler.extract_hog_from_all_images()
    hog_data = hogHandler.getHogValsforAllImages()
    hog_data = np.array(hog_data, dtype=None)
    hog_data = np.delete(hog_data, 0, 1)
    hog_data = np.array(hog_data)

    #surfCodebook.run_codebook(n_clusters,400, 0.3, cv2.INTER_CUBIC, 0)
    surf_data = surf_cb_handler.get_distributions()
    surf_data = np.array(surf_data, dtype=None)
    surf_data = np.delete(surf_data, 0, 1)

    sift_data = sift_cb_handler.get_distributions()
    sift_data = np.array(sift_data, dtype=None)
    sift_data = np.delete(sift_data, 0, 1)

    orb_data = orb_cb_handler.get_distributions()
    orb_data = np.array(orb_data, dtype=None)
    orb_data = np.delete(orb_data, 0, 1)

    est = KMeans(n_clusters=30)

    print(79 * '_')
    print(
        '% 9s' % 'init'
        '    time  inertia    homo   compl  v-meas     ARI AMI  silhouette')

    bench_k_means(est, "colourPerfomanceVmeta", colour_data)
    bench_k_means(est, "hogPerfomanceVmeta", hog_data)
Example #37
def plot_network(n_clusters, subset_job, subset_edu, no_jobs, no_edu):
    plt.figure(figsize=(6, 8))
    job_kmeans = KMeans(n_clusters=n_clusters)
    job_predict = job_kmeans.fit_predict(subset_job)
    empl_edu_kmean = KMeans(n_clusters=n_clusters)
    empl_predict = empl_edu_kmean.fit_predict(subset_edu)

    cluster_sum_jobs, cluster_sum_employ_edu = [], []

    for i in range(n_clusters):
        cluster_sum_employ_edu.append(
            sum_cluster(empl_predict, i, no_edu) / sum(no_edu))
        cluster_sum_jobs.append(
            sum_cluster(job_predict, i, no_jobs) / sum(no_jobs))

    jobs_centres = job_kmeans.cluster_centers_
    emp_edu_centres = empl_edu_kmean.cluster_centers_
    result, all_coords = min_span_tree(jobs_centres, emp_edu_centres,
                                       cluster_sum_jobs,
                                       cluster_sum_employ_edu)
    city_labels()
    plot_california_counties()
    plot_california()
    for i in range(len(result)):
        for j in range(len(result[i])):
            if result[i][j] == 0:  # NO LINK
                continue

            plt.scatter(jobs_centres[i][0] if i < n_clusters else
                        emp_edu_centres[i - n_clusters][0],
                        jobs_centres[i][1] if i < n_clusters else
                        emp_edu_centres[i - n_clusters][1],
                        edgecolors='b',
                        facecolors='none')
            plt.scatter(jobs_centres[j][0] if j < n_clusters else
                        emp_edu_centres[j - n_clusters][0],
                        jobs_centres[j][1] if j < n_clusters else
                        emp_edu_centres[j - n_clusters][1],
                        edgecolors='b',
                        facecolor='none')
            plt.plot(
                (jobs_centres[i][0] if i < n_clusters else
                 emp_edu_centres[i - n_clusters][0], jobs_centres[j][0]
                 if j < n_clusters else emp_edu_centres[j - n_clusters][0]),
                (jobs_centres[i][1] if i < n_clusters else
                 emp_edu_centres[i - n_clusters][1], jobs_centres[j][1] if
                 j < n_clusters else emp_edu_centres[j - n_clusters][1]), 'b-')

    plt.show()
Example #38
    def group_by_proximity(self, k=10):
        if len(self.points) == 0:
            return {}

        X = numpy.array([[p.lat, p.lon] for p in self.points])

        distance_matrix = distance.squareform(distance.pdist(X))
        db = KMeans(n_clusters=k).fit(distance_matrix)

        # re-attach ids
        grouped_points = {}
        for i, k in enumerate(db.labels_):
            logger.debug('idx, label [%s, %s]', i, k)
            if k not in grouped_points:
                grouped_points[k] = []
            point = self.points[i]
            grouped_points[k].append({'id': point.uid, 'lat': point.lat, 'lon': point.lon})

        logger.info('Finished grouping into %d groups.', len(grouped_points))
        return grouped_points
Example #39
class KMeansEstimator:
    """
    This class reads the tweets of users from a file and builds cluster centers on that data. It also provides
    a method for finding the closest cluster center for unseen data.
    """
    
    ADJECTIVE = 'JJ'
    
    """
    Feature keys used in clustering...
    """
    POLARITY_FEATURE_KEY = 'polarity'
    SUBJECTIVITY_FEATURE_KEY = 'subjectivity'
    TWEET_COUNT_FEATURE_KEY = 'tweetCount'
    """
    Features not considered for clustering...
    """
    USER_ID_FEATURE_KEY = 'userId'
    LONGITUDE_FEATURE_KEY = 'longitude'
    LATITUDE_FEATURE_KEY = 'latitude'
    
    
    """
    Predicted label feature name.
    """
    LABEL_FEATURE_KEY = 'label'

    RELEVENT_FEATURE_LIST = [USER_ID_FEATURE_KEY, LATITUDE_FEATURE_KEY, LONGITUDE_FEATURE_KEY, LABEL_FEATURE_KEY]
    
    def __init__(self, tweet_file_path, no_of_clusters):
        """
        The constructor reads csv file and builds the data matrix.
        """
        self.np_extractor = ConllExtractor()
        self.pos_tagger = NLTKTagger()
        self.tweet_file_path = tweet_file_path
        self.data_matrix = self.__get_data_matrix_from_file(tweet_file_path)
        self.vectorizer = DictVectorizer(sparse=True)
        self.k_means_estimator = KMeans(init="random", n_clusters=no_of_clusters)
        
    @time_it
    def __get_data_matrix_from_file(self, tweet_file_path):
        """
        Reads tweets from csv file at path "tweet_file_path", extracts features from the tweets and returns list
        of all feature vectors.
        """
        file_reader = csv.reader(open(tweet_file_path, "rb"), delimiter=',')
        next(file_reader)
        data_matrix = []
        for row in file_reader:
            logging.info("Extracting features for user_id:%s", row[0])
            feature_vector = {}
            feature_vector[self.USER_ID_FEATURE_KEY] = int(row[0])
            feature_vector[self.LATITUDE_FEATURE_KEY] = float(row[2])
            feature_vector[self.LONGITUDE_FEATURE_KEY] = float(row[3])
            feature_vector[self.TWEET_COUNT_FEATURE_KEY] = int(row[4])
            feature_vector.update(self.__get_features_from_tweet_text(row[1].decode('utf-8')))
            data_matrix.append(feature_vector)
            logging.info("Successfully extracted features for user_id:%s", row[0])
        return data_matrix
    
    @time_it
    def __get_features_from_tweet_text(self, tweet_text):
        """This function returns the following features from the tweet text:
        - Adjectives and their corresponding frequencies found in the tweet. Each adjective is a separate feature.
        - Subjectivity and polarity as determined by TextBlob.
        :returns:  (key,value) map of all features found. 
        """
        text_blob = TextBlob(tweet_text, np_extractor=self.np_extractor, pos_tagger=self.pos_tagger);
        adjective_map = dict(Counter((ele[0] for ele in set(text_blob.pos_tags) if ele[1] == self.ADJECTIVE)))
        polarity = text_blob.sentiment[0]
        subjectivity = text_blob.sentiment[1]
        return dict(adjective_map.items() + {self.POLARITY_FEATURE_KEY:polarity, self.SUBJECTIVITY_FEATURE_KEY:subjectivity}.items())
    
    @time_it
    def __get_clustering_data_matrix(self, data_matrix):
        """
        This method removes unnecessary features(features like user_id which are not relevant for building cluster centers) from
        the data matrix and returns a copy of the data matrix.
        """
        data_matrix_copy = copy.deepcopy(data_matrix)
        for feature_vector in data_matrix_copy:
            feature_vector.pop(self.USER_ID_FEATURE_KEY)
            feature_vector.pop(self.LATITUDE_FEATURE_KEY)
            feature_vector.pop(self.LONGITUDE_FEATURE_KEY)
        return data_matrix_copy


    @time_it
    def perform_clustering(self, features_to_include=None):
        """
        This function performs k-means clustering with "no_of_clusters" clusters of the data present in file at
        "tweet_file_path".
        It returns list of feature vector, where each feature vector contains only "features_to_include" or all features
        if "features_to_include" is None.
        """
        clustering_data_matrix = self.__get_clustering_data_matrix(self.data_matrix)
        transformed_data_matrix = self.vectorizer.fit_transform(clustering_data_matrix)
        
        self.k_means_estimator.fit(transformed_data_matrix, y=None)
        return self.__get_predicted_labels(self.data_matrix, features_to_include)

    @time_it    
    def __get_predicted_labels(self, data_matrix, features_to_include):
        """
        Finds the nearest cluster for all data points and adds a new feature label in all feature vectors of data matrix. The
        data matrix is modified in place.
        It returns a new copy of data_matrix with "features_to_include" features.
        """
        feature_names = self.vectorizer.get_feature_names()
        for feature_vector in data_matrix:
            row = [0] * len(feature_names)
            column = range(len(feature_names))
            data = map(lambda feature_name:feature_vector[feature_name] if feature_name in feature_vector else 0, feature_names)
            feature_csr_matrix = csr_matrix(coo_matrix((data, (row, column))))
            predicted_label = self.k_means_estimator.predict(feature_csr_matrix)
            feature_vector[self.LABEL_FEATURE_KEY] = predicted_label[0]
        
        expanded_data_matrix = self.__get_expanded_data_matrix(data_matrix)
        if features_to_include:
            return self.__get_filtered_data_matrix(expanded_data_matrix, features_to_include)
        else:
            return expanded_data_matrix

    @time_it
    def __get_filtered_data_matrix(self, data_matrix, features_to_include):
        """
        Removes all features except features_to_include
        """
        filtered_data_matrix = []
        for feature_vector in data_matrix:
            filtered_feature_vector = {}
            for feature_name in features_to_include:
                filtered_feature_vector[feature_name] = feature_vector[feature_name]
            filtered_data_matrix.append(filtered_feature_vector)
        return filtered_data_matrix
    
    @time_it
    def __get_expanded_data_matrix(self, data_matrix):
        """
        Adds new keys for missing features to all feature vectors of data_matrix. The data matrix is not modified, but a new 
        modified copy is returned.
        """
        feature_names = self.vectorizer.get_feature_names()
        expanded_data_matrix = copy.deepcopy(data_matrix)
        for feature_vector in expanded_data_matrix:
            for feature_name in feature_names:
                if feature_name not in feature_vector:
                    feature_vector[feature_name] = 0
        return expanded_data_matrix
    
    @time_it
    def predict_labels_for_data(self, file_path, features_to_include=None):
        """
        This function reads the tweets of different users from the file at file_path and assigns the closest 
        cluster center to each user.
        It returns list of tuples of (user_id,predicted_label,latitude, longitude).
        """
        data_matrix = self.__get_data_matrix_from_file(file_path)
        return self.__get_predicted_labels(data_matrix, features_to_include)
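A hedged usage sketch for the class above; the file names and cluster count are placeholders, not values from the original project.

estimator = KMeansEstimator("tweets_train.csv", no_of_clusters=5)
labeled_users = estimator.perform_clustering(
    features_to_include=KMeansEstimator.RELEVENT_FEATURE_LIST)
predictions = estimator.predict_labels_for_data("tweets_new.csv")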
Example #40
import numpy
import os
from sklearn.cluster.k_means_ import KMeans
import cPickle
import sys

# Performs K-means clustering and save the model to a local file

if __name__ == "__main__":
    if len(sys.argv) != 4:
        print "Usage: {0} sift_file cluster_num output_file".format(sys.argv[0])
        print "sift_file -- path to the sift file"
        print "cluster_num -- number of cluster"
        print "output_file -- path to save the k-means model"
        exit(1)

    sift_file = sys.argv[1]
    output_file = sys.argv[3]
    cluster_num = int(sys.argv[2])
    # Read data
    X = numpy.genfromtxt(sift_file, delimiter=";")
    # Fit model
    estimator = KMeans(n_clusters=cluster_num)
    estimator.fit(X)
    # Dump model
    with open(output_file, "wb") as f:
        cPickle.dump(estimator, f)

    print "K-means trained successfully!"
Example #41
import numpy as np
from sklearn.cluster.k_means_ import KMeans
from sklearn.datasets.samples_generator import make_blobs

np.random.seed(0)

batch_size = 45
centers = [[1, 1], [-1, -1], [1, -1]]
n_clusters = len(centers)
X, labels_true = make_blobs(n_samples=1000, centers=centers, cluster_std=0.4)

# Draw randoms
indxs = np.arange(1000)
np.random.shuffle(indxs)
centroids = X[indxs[:3]]

k_means = KMeans(k=3, max_iter=1, init=centroids)
k_means.fit(X)
k_means_labels1 = k_means.labels_
k_means_cluster_centers1 = k_means.cluster_centers_

k_means = KMeans(k=3, max_iter=2, init=centroids)
k_means.fit(X)
k_means_labels2 = k_means.labels_
k_means_cluster_centers2 = k_means.cluster_centers_

k_means = KMeans(k=3, max_iter=3, init=centroids)
k_means.fit(X)
k_means_labels3 = k_means.labels_
k_means_cluster_centers3 = k_means.cluster_centers_

k_means = KMeans(k=3, max_iter=4, init=centroids)
Example #42
def main():

    """CONFIGURATION"""
    num_clusters = 5  # Number of clusters
    random = False  # If true, randomly assign clusters to the states with equal probability. If false, actually compute the clusters.
    working_dir = "/home/jmaxk/proj/geoloc/cluster/fb1/"  # The input working_dir, which has 1 file per class. Each file contains the results of the linguistic ethnography tool

    """END CONFIGURATION"""

    if random:
        saveFiles = getSaveFiles(working_dir + 'results/random')
    else:
        saveFiles = getSaveFiles(working_dir + 'results/real')

    clusterFile = saveFiles[0]
    mapFile = saveFiles[1]
    featureIndeces = dict()
    classIndeces = []
    counter =0
    vecs = []


    #Turn each file into a vector to be clustered. Note
    for root, dirs, files in os.walk(working_dir):
        for f in files:
            fullpath = os.path.join(root, f)
            if os.path.splitext(fullpath)[1] == '.txt':
                with open(fullpath) as fp:
                    lines = fp.readlines()
                    vec = [0.0]*(len(lines) + 1)
                    for line in lines:
                        featVals = line.split(' ')
                        key = featVals[0]
                        val = featVals[1]
                        if not featureIndeces.has_key(key):
                            featureIndeces[key] = counter
                            counter = counter + 1
                        index = featureIndeces.get(key);
                        vec[index] = float(val)
                    vecs.append(vec)
                    abbr = os.path.basename(fullpath).split(".")[0]

                    #we only want to save actual states
                    if (us.states.lookup(abbr) != None):
                        st = (str(us.states.lookup(abbr).name))
                        classIndeces.append(st)

        #transform data into numpy array
        mylist = []
        for item in vecs:
            mylist.append(numpy.array(item))
        data = numpy.array(mylist)

        #cluster with kmeans, and save the clusters
        km = KMeans(n_clusters=num_clusters, init='k-means++', max_iter=100, n_init=10,
                verbose=False)
        raw_results =  km.fit_predict(data)
        results = dict(zip(classIndeces, raw_results))
        saveClusters(data,km, clusterFile) # this doesn't work with random

#   save the map
    if random:
        random_results = dict()
        for key in results:
            random_results[key] = randint(0,5)
        colors = genColors(random_results)
        saveMap(random_results,colors, mapFile)
    else:
        colors = genColors(results)
        saveMap(results,colors, mapFile)