Exemplo n.º 1
0
 def build_model(self):
     """Run K-means with the user-entered parameters and show the result plots.

     Reads the number of runs and number of clusters from the bound Tk
     variables, validates them, runs the clustering model, and embeds the
     produced scatter/map images into the window. Shows an error dialog for
     invalid input or for a ValueError raised by the model.
     """
     try:
         num_of_runs = self.Num_of_runs_val.get()
         num_of_clusters = self.Num_of_clusters_val.get()
         if num_of_runs < 1:
             tkMessageBox.showerror("K Means Clustering",
                                    'invalid number of runs')
             return
         # 164 is the record count of the dataset; more clusters than records
         # is meaningless.
         if num_of_clusters < 2 or num_of_clusters > 164:
             tkMessageBox.showerror(
                 "K Means Clustering",
                 'number of cluster have to be grater then 1 and smaller then 164 (number of recordes)'
             )
             return
         model.k_means(num_of_clusters, num_of_runs)
         scatter_path = model.plot_scatter()
         map_path = model.plot_map()
         root.geometry("1600x600+200+200")
         # Keep PhotoImage references on self so Tk does not lose them to
         # garbage collection.
         self.scatter_img = ImageTk.PhotoImage(Image.open(scatter_path))
         # BUG FIX: grid() returns None, so chaining it stored None in the
         # label attributes; create the widget first, then place it.
         self.scatter_img_label = tk.Label(self, image=self.scatter_img)
         self.scatter_img_label.grid(row=4, column=0, columnspan=3)
         self.map_img = ImageTk.PhotoImage(Image.open(map_path))
         self.map_img_label = tk.Label(self, image=self.map_img)
         self.map_img_label.grid(row=4, column=3, columnspan=3)
         answer = tkMessageBox.askokcancel(
             "K Means Clustering",
             'Clustring complited successfully, do you want to exit the program?'
         )
         if answer:
             self.master.quit()
     except ValueError as ve:
         # BUG FIX: Exception.message does not exist in Python 3;
         # str(ve) works on both Python 2 and 3.
         tkMessageBox.showerror("K Means Clustering", str(ve))
Exemplo n.º 2
0
def clustering():
    """Flask view: cluster the posted matrix with k-means.

    Query parameters:
        num_clusters   -- positive integer, default 4.
        max_iterations -- positive integer, default 300.
    Request body: ASCII text holding an MxN matrix of floats (parsed by
    ``parse_input``).

    Returns a ``text/plain`` response of the form ``[r11, ..., r1N, label;\\n ...]``
    where each input row is followed by its 1-based cluster label.
    Aborts with HTTP 400 on any invalid input.
    """
    # NOTE: `assert` statements are stripped under `python -O`, so input
    # validation is done with explicit checks that raise ValueError instead.
    try:
        n_clusters = int(request.args.get('num_clusters', 4))
        if n_clusters <= 0:
            raise ValueError(n_clusters)
    except ValueError:
        # flask.abort raises HTTPException itself; no extra `raise` needed.
        abort(400, 'num_clusters must be a positive integer')

    try:
        max_iterations = int(request.args.get('max_iterations', 300))
        if max_iterations <= 0:
            raise ValueError(max_iterations)
    except ValueError:
        abort(400, 'max_iterations must be a positive integer')

    try:
        # UnicodeDecodeError is a subclass of ValueError, so a non-ASCII
        # body is rejected by the same handler.
        points = parse_input(request.data.decode('ascii'))
    except ValueError:
        abort(400, 'Input data must be matrix MxN of float numbers')

    if not points.size:
        abort(400, 'Input matrix cannot be empty')

    labels = k_means(points, n_clusters, max_iterations)
    labels += 1  # normalize labels to begin from 1

    response = make_response('[{}]'.format(';\n'.join(
        ', '.join(f'{x}' for x in list(row) + list(label))
        for row, label in zip(points, labels))))
    response.headers['Content-Type'] = 'text/plain; charset=utf-8'
    return response
Exemplo n.º 3
0
def test_model():
    """Compare our k-means implementation against scikit-learn's.

    Both implementations are started from the same initial centers on a
    500-row shuffled sample of the dataset. Because cluster ids may be
    permuted between implementations, only the multiset of cluster sizes
    is compared.

    Returns True when both produce the same set of cluster sizes.
    """
    df = pandas.read_csv(DATASET_URL, sep='\t')
    # BUG FIX: DataFrame.as_matrix() was removed in pandas 1.0;
    # .to_numpy() is the supported replacement (available since 0.24).
    X = df[['Distance_Feature', 'Speeding_Feature']].to_numpy()
    np.random.shuffle(X)
    X = X[:500]

    test_centers = initial_centers(X, 4)

    our_result = k_means(X,
                         n_clusters=4,
                         init=test_centers,
                         convergence_threshold=1e-6)
    sample_result = KMeans(n_clusters=4,
                           init=test_centers,
                           n_init=1,
                           algorithm='full').fit(X)

    # Per-cluster sizes; label ids themselves are not comparable.
    our_pmf = Counter(our_result.reshape(-1)).values()
    sample_pmf = Counter(sample_result.labels_.reshape(-1)).values()

    print(our_pmf, sample_pmf)
    return set(our_pmf) == set(sample_pmf)
Exemplo n.º 4
0
        # Best-effort cache of the processed tweets; the matching `try:`
        # begins above this excerpt.
        with open('tweets.pkl', 'wb') as output:
            pickle.dump(tweets, output, pickle.HIGHEST_PROTOCOL)
    except:
        # NOTE(review): bare except silently swallows every error (including
        # KeyboardInterrupt); consider narrowing to OSError/pickle.PickleError.
        pass
    try:
        # Best-effort cache of the original tweet content.
        with open('org_tweets.pkl', 'wb') as output:
            pickle.dump(orginal_content, output, pickle.HIGHEST_PROTOCOL)
    except:
        # NOTE(review): same silent-swallow concern as above.
        pass

    print('write file completed')
# Vectorize the tweets, cluster them with k-means, and dump the resulting
# cluster -> tweets mapping to JSON.
# tfs = create_combined_vector(tweets, word_vec.model)
tfs = create_vector(tweets)
print(np.where(np.isnan(tfs)))  # report positions of any NaNs in the feature matrix
k = 50
km = k_means(tfs, k)

# Group the original tweet texts by their assigned cluster id.
new_json = {
    'class_' + str(cluster_id): [
        orginal_content[idx]
        for idx in np.where(km.labels_ == cluster_id)[0]
    ]
    for cluster_id in set(km.labels_)
}

with open('out.json', 'w') as outfile:
    json.dump(new_json, outfile)

#plt.hist(km.labels_, bins=k)
#plt.show()
Exemplo n.º 5
0
    # Compute summary statistics for the features.
    print('Feature norm l1', dtlib.feature_norm_l1(dt))
    print('Feature mean', dtlib.feature_mean(dt))
    print('Feature std', dtlib.feature_std(dt))

    # Compute the exponential random variable (left disabled).
    # exponential_vectorize = np.vectorize(my_exponential)
    # dt_exp = exponential_vectorize(dt)
    # plt.hist(dt_exp)
    # plt.show()

    # Reduce to 2 dimensions (PCA) so the data can be plotted.
    dt2 = dtlib.pca(dt, 2)

    # Compute the cluster centers with k-means. NOTE(review): the meaning of
    # the third argument (100) depends on model.k_means — presumably iteration
    # or run count; confirm against its definition.
    kmeans_centroids, kmeans_cluster_ids = model.k_means(dt2, n_clusters, 100)
    #kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(dt2)

    # Compare and plot: the projected data points plus the k-means centroids.
    plt.scatter(dt2[:, 0], dt2[:, 1])
    plt.scatter(kmeans_centroids[:, 0], kmeans_centroids[:, 1])
    #plt.scatter(kmeans.cluster_centers_[:,0], kmeans.cluster_centers_[:,1])
    plt.show()
    '''
    Conclusion:
    A pesar de haber agregado ruido en el datasets (NaN reemplazados por mean)
    se puede apreciar a simple vista 4 conjuntos dominantes y el algoritmo
    de k-means los identifica sin problemas
    '''

    # Repeat the procedure with a dataset whose centroids are closer together.