Example #1
    def get_new_features_by_clustering_features(self, n_clusters):
        """
        Use k-means to group features. Since the features within a group can be
        considered similar, replace each group with its cluster centroid, which
        becomes a new feature.
        """
        # Fit on the transpose so each feature (column) is clustered as a sample.
        clusterer = sk_KMeans(n_clusters=n_clusters).fit(self.X_df.values.T)
        # Transpose back: each centroid becomes a new feature column.
        return clusterer.cluster_centers_.T
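A minimal standalone sketch of the same transpose trick (the array shapes and random data here are illustrative assumptions, not from the original class):

import numpy as np
from sklearn.cluster import KMeans as sk_KMeans

rng = np.random.default_rng(0)
X = rng.normal(size=(100, 30))  # 100 samples, 30 possibly redundant features

# Fit on X.T so that each feature (column of X) becomes a "sample" to cluster.
clusterer = sk_KMeans(n_clusters=10, n_init=10).fit(X.T)

# Each centroid is a length-100 vector, i.e. one new feature column.
X_new = clusterer.cluster_centers_.T  # shape (100, 10)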
Example #2
    def __init__(self, path_to_data_csv, model_params, random_state=0):
        self._kmeans = sk_KMeans(
            n_clusters=model_params['num_clusters'],
            n_init=model_params['n_init'],
            random_state=random_state,
        )
        self.concat_data, self.labels, self.data_by_sample = self._load_data(path_to_data_csv, model_params)
        # With C=1e10 the l1 penalty is effectively disabled; liblinear is a
        # solver that supports penalty='l1' (the newer default, lbfgs, does not).
        self.logistic_regressor = LogisticRegression(penalty='l1', C=1e10,
                                                     solver='liblinear')
Example #3
def test_k_means_python(benchmark, make_data):
    # `benchmark` is the pytest-benchmark fixture; `make_data` is a local fixture.
    dataset, cluster_index = make_data
    model = sk_KMeans(n_clusters=3,
                      init="random",
                      algorithm="full",  # renamed to "lloyd" in scikit-learn >= 1.1
                      max_iter=100,
                      tol=1e-4,
                      n_init=1)
    labels = benchmark(model.fit_predict, dataset)
    assert len(labels) == len(cluster_index)
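The `make_data` fixture is not shown above; a plausible pytest fixture that would satisfy this test might look like the following (the blob parameters are assumptions):

import pytest
from sklearn.datasets import make_blobs

@pytest.fixture
def make_data():
    # Three well-separated Gaussian blobs; cluster_index holds the true labels.
    dataset, cluster_index = make_blobs(n_samples=300, centers=3, random_state=42)
    return dataset, cluster_index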
Example #4
    def fit(x_, **kmeans_params_):
        if batch_version is False:
            try:
                model = sk_KMeans(**kmeans_params_).fit(x_)
            except MemoryError:
                warnings.warn("A MemoryError occurred while running the non-batch "
                              "version of k-means; falling back to MiniBatchKMeans. "
                              "Make sure all parameters passed via `kmeans_params` "
                              "are accepted by the batch version and have the "
                              "desired values.")
                model = sk_MiniBatchKMeans(**kmeans_params_).fit(x_)
        else:
            model = sk_MiniBatchKMeans(**kmeans_params_).fit(x_)

        return model.cluster_centers_, model.labels_
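A usage sketch for this helper, assuming `batch_version` is False in the enclosing scope; only keywords accepted by both `KMeans` and `MiniBatchKMeans` are passed, which is exactly what the warning asks for:

import numpy as np

X = np.random.default_rng(0).normal(size=(1000, 8))
centers, labels = fit(X, n_clusters=4, n_init=3, random_state=0)
print(centers.shape)  # (4, 8)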
Example #5
    def fit(self, X):
        """
        Fit the model on the training data.

        Args:
            X: training data
        """
        model = sk_KMeans(n_clusters=self.n_clusters).fit(X)
        self.model = model
        # Mirror scikit-learn's fitted attributes on the wrapper itself.
        self.cluster_centers_ = model.cluster_centers_
        self.labels_ = model.labels_
        self.inertia_ = model.inertia_
        self.data = X
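The attributes copied here mirror scikit-learn's own fitted attributes. A short sketch of what they expose, using `sk_KMeans` directly since the enclosing wrapper class is not shown in full:

from sklearn.cluster import KMeans as sk_KMeans
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=300, centers=3, random_state=1)
model = sk_KMeans(n_clusters=3, n_init=10).fit(X)
print(model.inertia_)             # sum of squared distances to the closest centroid
print(model.cluster_centers_[0])  # coordinates of the first centroid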
Example #6
    def select_clusters(self, X, beginning_clusters=2, end_clusters=10):
        params = self.get_params()
        n_clusters = []
        inertia = []

        # Sweep the cluster count and record each model's inertia for an elbow plot.
        for i in range(beginning_clusters, end_clusters + 1):
            params['n_clusters'] = i
            k_means_model = sk_KMeans(**params)
            k_means_model.fit(X)

            n_clusters.append(i)
            inertia.append(k_means_model.inertia_)

        return n_clusters, inertia
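The returned `(n_clusters, inertia)` pair is meant for an elbow plot. A standalone sketch of the same sweep without the class wrapper (the dataset is an illustrative assumption):

import matplotlib.pyplot as plt
from sklearn.cluster import KMeans as sk_KMeans
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=500, centers=4, random_state=0)

ks, inertias = [], []
for k in range(2, 11):
    ks.append(k)
    inertias.append(sk_KMeans(n_clusters=k, n_init=10).fit(X).inertia_)

plt.plot(ks, inertias, marker="o")
plt.xlabel("n_clusters")
plt.ylabel("inertia")
plt.show()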
Example #7
    def execute(self):
        """Builds the clustering model."""
        # Note: precompute_distances and n_jobs were deprecated in
        # scikit-learn 0.23/0.24 and removed in 1.0.
        self.model = sk_KMeans(n_clusters=self.n_clusters,
                               init=self.init,
                               n_init=self.n_init,
                               max_iter=self.max_iter,
                               tol=self.tol,
                               precompute_distances=self.precompute_distances,
                               verbose=self.verbose,
                               random_state=self.random_state,
                               copy_x=self.copy_x,
                               n_jobs=self.n_jobs,
                               algorithm=self.algorithm).fit(self.data)

        self.clusters = super().make_clusters(self.data, self.model.labels_)
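Since `precompute_distances` and `n_jobs` were removed in scikit-learn 1.0, a version-tolerant construction might look like this hedged sketch (the keyword values are placeholders):

from sklearn.cluster import KMeans as sk_KMeans

kwargs = dict(n_clusters=8, init='k-means++', n_init=10, max_iter=300,
              tol=1e-4, verbose=0, random_state=0, copy_x=True)
try:
    # Older scikit-learn (< 1.0) still accepts these keywords.
    model = sk_KMeans(precompute_distances=True, n_jobs=None, **kwargs)
except TypeError:
    # Newer releases removed them; fall back to the surviving keywords.
    model = sk_KMeans(**kwargs)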
Example #8
File: fun.py Project: napoler/albert-K
def find_k(presentence_embedding, max_k=10):
    """
    Pick the elbow point.
    https://stackoverflow.com/questions/19197715/scikit-learn-k-means-elbow-criterion
    """
    import matplotlib.pyplot as plt
    from sklearn.metrics import silhouette_score
    X = presentence_embedding
    p_y = []
    p_x = []
    for n_cluster in range(2, max_k):
        kmeans = sk_KMeans(n_clusters=n_cluster).fit(X)
        label = kmeans.labels_
        sil_coeff = silhouette_score(X, label, metric='euclidean')
        print("For n_clusters={}, the Silhouette Coefficient is {}".format(
            n_cluster, sil_coeff))
        p_y.append(sil_coeff)
        p_x.append(n_cluster)

    plt.figure()
    plt.plot(p_x, p_y)
    plt.xlabel("k")
    plt.ylabel("Silhouette Coefficient")  # the y-values are silhouette scores, not SSE
    plt.show()
    return p_x, p_y
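With the corrected version above returning `(p_x, p_y)`, picking the best k reduces to taking the one with the highest silhouette score; a usage sketch with placeholder embeddings:

import numpy as np

embeddings = np.random.default_rng(0).normal(size=(200, 32))  # placeholder data
ks, scores = find_k(embeddings, max_k=10)
best_k = ks[scores.index(max(scores))]
print("best k:", best_k)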
Example #9
    def fit2(self, markings, user_ids, jpeg_file=None, debug=False):
        # Increase the cluster count until every sufficiently large cluster
        # is "clean", i.e. contains at most one marking per user.
        for n in range(1, len(markings)):
            kmeans = sk_KMeans(init='k-means++', n_clusters=n, n_init=10).fit(markings)

            labels = kmeans.labels_
            unique_labels = set(labels)
            # Need to check whether all clusters are either "clean" or noise.
            clean = True
            for k in unique_labels:
                users = [ip for index, ip in enumerate(user_ids) if labels[index] == k]

                if len(users) < self.min_samples:
                    continue

                # A cluster is "clean" only if no user appears in it twice.
                if len(set(users)) != len(users):
                    clean = False
                    break

            if clean:
                break

        print(n)
        return None, None, None
Example #10
    add_ClusteringServiceServicer_to_server(ClusteringService(model),
                                            grpc_server)
    # Start GRPC Server
    grpc_server.add_insecure_port('[::]:5001')
    grpc_server.start()
    # Keep application alive
    try:
        while True:
            time.sleep(60 * 60 * 24)
    except KeyboardInterrupt:
        grpc_server.stop(0)


if __name__ == "__main__":
    logging.basicConfig()

    n_clusters = 100
    dataset, labels = make_blobs(n_clusters)
    if os.getenv("RUST", None) is None:
        model = sk_KMeans(n_clusters,
                          init="random",
                          algorithm="full",
                          max_iter=100)
        model.fit(dataset)
        log(30, "Python model has been loaded")  # 30 == logging.WARNING
    else:
        model = KMeans.load("data/rust_k_means_model.json")
        log(30, "Rust model has been loaded")

    serve(model)
Example #11
File: fun.py Project: napoler/albert-K
def Pre_KMeans(new_text_list, tokenizer, model, n_cluster=10):
    """Embed the texts, then cluster the sentence embeddings with k-means."""
    presentence_embedding, text_list, _ = get_embedding_np(
        new_text_list, [], tokenizer, model)
    kmeans = sk_KMeans(n_clusters=n_cluster).fit(presentence_embedding)
    return kmeans.labels_
Example #12
File: ml.py Project: sangheestyle/stools
def KMeans(vectors, n_clusters, max_iter):
    # Note: precompute_distances was removed in scikit-learn 1.0.
    km = sk_KMeans(n_clusters=n_clusters, precompute_distances=False,
                   init='k-means++', max_iter=max_iter, n_init=1)
    predict = km.fit_predict(vectors)
    return predict
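A quick usage sketch for this wrapper (the TF-IDF input is an illustrative assumption; note that `precompute_distances` requires scikit-learn < 1.0):

from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["k-means groups documents", "clustering text is useful",
        "k-means on tf-idf vectors", "text clustering example"]
vectors = TfidfVectorizer().fit_transform(docs)
labels = KMeans(vectors, n_clusters=2, max_iter=100)
print(labels)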
def test_set():

    data = {}
    timer = Timer()

    for test in range(0, 1):
        path = './data/test' + str(test) + '.dot'
        print("Extracting data from : ", path)
        vertex_set, edge_set = dot_extract(path=path)
        print("Number Of Items : ", len(vertex_set.values()))

        for n_clusters in [3, 5]:
            print("NEW CLUSTER SIZE", n_clusters)
            for seed in range(10):
                print("CLUSTER SIZE", n_clusters, "Iteration : ", seed)
                print("Model 1")
                model_1 = KMeans(random_state=seed,
                                 n_clusters=n_clusters,
                                 n_init=10,
                                 max_iter=300)
                timer.start()
                k_1 = model_1.fit(vertex_set, edge_set)
                timer.stop()
                c_1_dist = evaluate_model(vertex_set, edge_set, k_1)
                l_1_1_norm = np.sum(c_1_dist)
                print("verticies Chosen")
                print(k_1)
                print("centroid distance")
                print(c_1_dist)
                print("l1 norm")
                print(l_1_1_norm)
                print("time")
                print(timer.get_time())

                name = "Model_1_test" + str(test) + "_" + "f_" + str(
                    n_clusters) + "_"
                data[name +
                     "dist"] = data.get(name + "dist", []) + c_1_dist.tolist()
                data[name + "l1"] = data.get(name + "l1", []) + [l_1_1_norm]
                data[name +
                     "t"] = data.get(name + "t", []) + [timer.get_time()]

                print("Model 2")
                model_2 = VoronoiFacilitySelection(random_state=seed,
                                                   max_iter=300,
                                                   n_cells=n_clusters)
                timer.start()
                k_2 = model_2.fit(vertex_set, edge_set)
                timer.stop()
                c_2_dist = evaluate_model(vertex_set, edge_set, k_2)
                l_1_2_norm = np.sum(c_2_dist)
                print("verticies Chosen")
                print(k_2)
                print("centroid distance")
                print(c_2_dist)
                print("l1 norm")
                print(l_1_2_norm)
                print("time")
                print(timer.get_time())

                name = "Model_2_test" + str(test) + "_" + "f_" + str(
                    n_clusters) + "_"
                data[name +
                     "dist"] = data.get(name + "dist", []) + c_2_dist.tolist()
                data[name + "l1"] = data.get(name + "l1", []) + [l_1_2_norm]
                data[name +
                     "t"] = data.get(name + "t", []) + [timer.get_time()]

                print("Model 3")
                timer.start()
                model_3 = sk_KMeans(random_state=seed,
                                    n_clusters=n_clusters).fit(
                                        list(vertex_set.values()))
                timer.stop()
                # Snap each centroid to the nearest actual vertex.
                NN = NearestNeighbors(n_neighbors=1, radius=1e-13).fit(
                    np.array(list(vertex_set.values())))
                Y = NN.kneighbors(model_3.cluster_centers_, 1, return_distance=False)
                k_3 = [list(vertex_set.keys())[i[0]] for i in Y]
                c_3_dist = evaluate_model(vertex_set, edge_set, k_3)
                l_1_3_norm = np.sum(c_3_dist)
                print("verticies Chosen")
                print(k_3)
                print("centroid distance")
                print(c_3_dist)
                print("l1 norm")
                print(l_1_3_norm)
                print("time")
                print(timer.get_time())

                name = "Model_3_test" + str(test) + "_" + "f_" + str(
                    n_clusters) + "_"
                data[name +
                     "dist"] = data.get(name + "dist", []) + c_3_dist.tolist()
                data[name + "l1"] = data.get(name + "l1", []) + [l_1_3_norm]
                data[name +
                     "t"] = data.get(name + "t", []) + [timer.get_time()]

    #pprint(data)
    print("Writing to csv")
    for key, item in data.items():
        with open('data/' + key + '.csv', 'w+', newline='') as csvfile:
            writer = csv.writer(csvfile, delimiter=',')
            writer.writerow([key])
            for d in item:
                writer.writerow([d])