Example #1
def init_q_with_kmeans(self, data):
    '''
    Initializes the EM algorithm.
    Parameters: data: (np.array(nb_samples, nb_components)) the samples on which EM will be run
    '''
    # Hard-assign each sample to its k-means cluster:
    # q_e_step[i, j] = 1 if sample i was assigned to cluster j, 0 otherwise.
    self.q_e_step = np.zeros([data.shape[0], self.k])
    km = KMeans(self.k)
    km.fit(data)
    prediction = km.predict(data)
    for i in range(data.shape[0]):
        self.q_e_step[i, prediction[i]] = 1
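For reference (not part of the project above), a self-contained sketch of the same hard-assignment idea, assuming scikit-learn's KMeans and an (n_samples, n_features) NumPy array; the helper name init_responsibilities is illustrative only:

import numpy as np
from sklearn.cluster import KMeans  # assumed here; the method above may rely on a custom KMeans class

def init_responsibilities(data, k):
    # One-hot responsibilities from a k-means hard assignment.
    labels = KMeans(n_clusters=k).fit_predict(data)   # cluster index of each sample
    q = np.zeros((data.shape[0], k))
    q[np.arange(data.shape[0]), labels] = 1           # q[i, j] = 1 iff sample i is in cluster j
    return q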
Example #2
def _init_parameters(self, data):
    if self.init == 'random':
        # Random init: pick k samples as means, uniform mixture weights and responsibilities.
        self.mu = data[np.random.choice(data.shape[0], self.k, replace=False)]
        self.pi = [1 / self.k for j in range(self.k)]
        self.q = 1 / self.k * np.ones((data.shape[0], self.k))
    elif self.init == 'kmeans':
        # K-means init: means are the cluster centers, weights are the cluster proportions,
        # responsibilities are one-hot cluster memberships.
        clf = KMeans(k=self.k, random_seed=self.random_seed, init='kmeans++')
        clf.fit(data)
        self.mu = clf.centers
        self.pi = [np.sum(clf.labels == j) / data.shape[0] for j in range(self.k)]
        self.q = np.zeros((data.shape[0], self.k))
        for index, label in np.ndenumerate(clf.labels):
            self.q[index, int(label)] = 1
    self.sigma = np.zeros((self.k, data.shape[1], data.shape[1]))
    if self.format_covariance == 'isotropic':
        # Isotropic: sigma_j = sigma_j^2 * I, where sigma_j^2 is the responsibility-weighted
        # sum of squared distances to the mean, divided by 2 * sum_i q[i, j].
        for j in range(self.k):
            sigma_squared = sum([self.q[i, j] * np.dot(x_i - self.mu[j, :], x_i - self.mu[j, :]) for (i, x_i) in enumerate(data)]) / (2 * np.sum(self.q[:, j]))
            self.sigma[j] = sigma_squared * np.identity(data.shape[1])
    elif self.format_covariance == 'general':
        # General: full responsibility-weighted empirical covariance per component.
        for j in range(self.k):
            mu_j = self.mu[j, :].reshape((-1, 1))
            self.sigma[j] = sum([self.q[i, j] * (x_i.reshape((-1, 1)) - mu_j).dot(x_i.reshape((-1, 1)).T - mu_j.T) for (i, x_i) in enumerate(data)]) / np.sum(self.q[:, j])
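As a side note (not from the original code), the 'general' covariance branch can be written without the inner Python list; this sketch assumes data is an (n, d) NumPy array, q the (n, k) responsibility matrix and mu the (k, d) matrix of component means:

import numpy as np

def weighted_covariances(data, q, mu):
    # Per-component responsibility-weighted covariance, equivalent to the 'general' branch above.
    k, d = mu.shape
    sigma = np.zeros((k, d, d))
    for j in range(k):
        diff = data - mu[j]                   # (n, d) deviations from the component mean
        weighted = q[:, j][:, None] * diff    # weight each deviation by its responsibility
        sigma[j] = weighted.T @ diff / np.sum(q[:, j])
    return sigma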
Example #3
import pandas as pd
from Kmeans import KMeans
from Plot import Plot

# Cluster the Mall Customers data on two feature columns (indices 3 and 4).
dataset = pd.read_csv('data/Mall_Customers.csv')
X = dataset.iloc[:, [3, 4]].values

# The local KMeans class is used without a separate fit() call,
# so predict() is expected to perform the clustering itself.
clf = KMeans(k=5)
y_pred = clf.predict(X)

p = Plot()
p.plot_in_2d(X, y_pred, title="K-Means Clustering")
Example #4
File: KmeansMain.py  Project: devekar/DM4
    # Body of the project's CSV parser (the enclosing def and imports are not shown in this snippet).
    # Python 2 style: the CSV file is opened in binary mode and map() returns a list.
    dataMatrix = []   # raw CSV rows (initialization added here; cut off in the original snippet)
    matrix = []
    word_list = []
    topic_list = []
    place_list = []
    with open(filepath, 'rb') as csv_file:
        reader = csv.reader(csv_file, delimiter=',', quotechar='"')
        for row in reader:
            dataMatrix.append(row)

    # The header row mixes plain word columns with "t_"-prefixed topics and "p_"-prefixed places.
    for item in dataMatrix[1]:
        if "_" not in item:
            word_list.append(item)
        elif "t_" in item:
            topic_list.append(item[2:])
        elif "p_" in item:
            place_list.append(item[2:])

    word_list = word_list[1:]  # Remove 'Article #'
    words_topics_size = len(topic_list) + len(word_list)

    # Keep the article id and convert the remaining counts to integers.
    for row in dataMatrix[2:]:
        matrix.append([row[0]] + map(int, row[1:]))
    return {"topic_list": topic_list, "word_list": word_list, "place_list": place_list, "matrix": matrix}


data = parseDM()
clusters = int(sys.argv[1])
dist_type = int(sys.argv[2])
kmeans = KMeans(clusters, dist_type, data)
kmeans.get_clusters()
Example #5
File: KmeansMain.py  Project: devekar/DM5
        elif "t_" in item:
            topic_list.append(item[2:])
        elif "p_" in item:
            place_list.append(item[2:])

    word_list = word_list[1:] # Remove 'Article #'
    words_topics_size = len(topic_list) + len(word_list)

    for row in dataMatrix[2:]:
        matrix.append( [row[0]] + map(int, row[1:]) )
    return {"topic_list":topic_list, "word_list": word_list, "place_list":place_list, "matrix": matrix}


def write_clusters(clusters):
    # Write one cluster per line as space-separated member ids.
    file_name = "Kmeans_clusters_" + str(len(clusters)) + ".txt"
    with open(file_name, "w") as f:
        for cluster in clusters:
            f.write(' '.join(map(str, cluster)))
            f.write('\n')


''' Main '''

data = parseDM()
clusters = int(sys.argv[1])
dist_type = int(sys.argv[2])
kmeans = KMeans(clusters, dist_type, data)
clusters_ind = kmeans.get_clusters()
write_clusters(clusters_ind)
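A possible counterpart for reading the file back (illustrative only, not in the project):

def read_clusters(file_name):
    # Each line written by write_clusters above is one cluster of space-separated member ids.
    with open(file_name) as f:
        return [line.split() for line in f if line.strip()]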
Example #6
import numpy as np
from sklearn.datasets import make_blobs

from Kmeans import KMeans
import timeit
# from sklearn.cluster import KMeans

start = timeit.default_timer()

# scikit-learn version, for comparison:
# km = KMeans()

# X, y = make_blobs(centers=3, n_samples=500, n_features=2, shuffle=True, random_state=17)
# print(X.shape)
# y_pred = km.fit_predict(X)
# runs in 0.08 sec

X, y = make_blobs(centers=3,
                  n_samples=500,
                  n_features=2,
                  shuffle=True,
                  random_state=17)
print(X.shape)

clusters = len(np.unique(y))
print(clusters)
k = KMeans(K=clusters, max_iters=150, plot_steps=False)
y_pred = k.fit_predict(X)
k.plot()
# runs in 1.8 seconds, so not as efficient as sklearn's k-means, but better than I thought

stop = timeit.default_timer()

print('Time: ', stop - start)
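To reproduce the scikit-learn timing mentioned in the comments, a sketch using the same blob settings (not part of the original script):

import timeit
import numpy as np
from sklearn.cluster import KMeans as SKKMeans
from sklearn.datasets import make_blobs

X, y = make_blobs(centers=3, n_samples=500, n_features=2, shuffle=True, random_state=17)
start = timeit.default_timer()
y_pred_sklearn = SKKMeans(n_clusters=3, n_init=10).fit_predict(X)
print('sklearn time:', timeit.default_timer() - start)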
Example #7
        reader = csv.reader(csv_file, delimiter=',', quotechar='"')
        for row in reader:
            dataMatrix.append(row)

    for item in dataMatrix[1]:
        if "_" not in item:
            word_list.append(item)
        elif "t_" in item:
            topic_list.append(item[2:])
        elif "p_" in item:
            place_list.append(item[2:])

    word_list = word_list[1:]  # Remove 'Article #'
    words_topics_size = len(topic_list) + len(word_list)

    for row in dataMatrix[2:]:
        matrix.append([row[0]] + map(int, row[1:]))
    return {
        "topic_list": topic_list,
        "word_list": word_list,
        "place_list": place_list,
        "matrix": matrix
    }


data = parseDM()
clusters = int(sys.argv[1])
dist_type = int(sys.argv[2])
kmeans = KMeans(clusters, dist_type, data)
kmeans.get_clusters()