Пример #1
0
        def select_next(iterval):
            """Return the index of the next data sample to select.

            Without the robust map this is simply the position of the largest
            ``iterval``. With it, the ``self._robust_nselect`` top-ranked
            samples are clustered by k-means and the candidate with the
            smallest ``pdist`` value to the centre of the most populated
            cluster is returned. The candidates are also appended to
            ``self.sub``.
            """
            if not self._robust_map:
                return np.argmax(iterval)

            # sample indices ordered from highest to lowest iterval
            ranked = np.argsort(iterval)[::-1]
            candidates = ranked[:self._robust_nselect]
            d_sub = self.data[:, candidates]
            self.sub.extend(candidates)

            # cluster the candidate samples
            kmeans_mdl = Kmeans(d_sub, num_bases=self._robust_cluster)
            kmeans_mdl.factorize(niter=10)

            # count the members of every cluster and keep the fullest one
            bin_edges = range(self._robust_cluster + 1)
            sizes = np.histogram(kmeans_mdl.assigned, bin_edges)[0]
            largest_cluster = np.argmax(sizes)

            centre = kmeans_mdl.W[:, largest_cluster:largest_cluster + 1]
            distances = pdist(centre, d_sub)
            return ranked[np.argmin(distances)]
Пример #2
0
Файл: cnmf.py Проект: ririw/pymf
    def init_h(self):
        """Initialise ``H``, ``G`` and ``W`` where they are not set yet.

        ``H`` and ``G`` are seeded from one k-means clustering of
        ``self.data`` (10 iterations, ``self._num_bases`` clusters); ``W`` is
        then computed as ``data . G``. Attributes that already exist on the
        instance are left untouched.
        """
        need_h = not hasattr(self, "H")
        need_g = not hasattr(self, "G")

        if need_h or need_g:
            # The cluster assignment feeds both H and G, so it is computed
            # whenever either is missing; this keeps G buildable when the
            # caller supplied H up front.
            km = Kmeans(self.data[:, :], num_bases=self._num_bases)
            km.factorize(niter=10)
            assign = km.assigned

            # number of samples assigned to each cluster
            num_i = np.zeros(self._num_bases)
            for i in range(self._num_bases):
                num_i[i] = len(np.where(assign == i)[0])

        if need_h:
            # one-hot cluster membership plus a constant offset of 0.2
            self.H = np.zeros((self._num_bases, self._num_samples))
            self.H.T[range(len(assign)), assign] = 1.0
            self.H += 0.2 * np.ones((self._num_bases, self._num_samples))

        if need_g:
            # one-hot membership plus 0.01, each row divided by the size of
            # the cluster its sample was assigned to
            self.G = np.zeros((self._num_samples, self._num_bases))
            self.G[range(len(assign)), assign] = 1.0
            self.G += 0.01
            self.G /= np.tile(np.reshape(num_i[assign], (-1, 1)), self.G.shape[1])

        if not hasattr(self, "W"):
            self.W = np.dot(self.data[:, :], self.G)
Пример #3
0
    def create_splits(self, X):
        """Collect per-feature thresholds from repeated k-means fits.

        Every column of ``X`` is fitted 50 times; the ``means`` of the fit
        with the lowest reported error are appended to ``self.thresholds``.
        """
        # unpacking also enforces that X is two-dimensional
        _, n_features = X.shape

        # one entry per feature
        self.thresholds = []

        for col in range(n_features):
            # turn the (n,) column into an (n, 1) matrix
            column = X[:, col]
            column = np.reshape(column, [column.size, 1])

            # NOTE(review): `k` is not defined inside this method; presumably
            # it comes from module scope -- confirm.
            clusterer = Kmeans(k=k)
            best_error = np.inf
            best_means = None

            for _ in range(50):
                clusterer.fit(column)
                current_error = clusterer.error(column)
                if not current_error < best_error:
                    continue
                best_error = current_error
                best_means = clusterer.means

            self.thresholds.append(best_means)
Пример #4
0
class BagOfWords:
    """Bag-of-visual-words encoder built on a local feature extractor and
    a k-means vocabulary of ``nclusters`` words."""

    def __init__(self, local_feature_extractor_name='hog', nclusters=256):
        self.nclusters = nclusters
        # 'hog' is the only extractor known to this class
        if local_feature_extractor_name != 'hog':
            raise Exception("Unknown feature extractor")
        self.feature_extractor = HOGFeatureExtractor()
        self.kmeans = None

    def extract(self, X):
        """Run the feature extractor on a 4-dimensional input."""
        assert X.ndim == 4
        return self.feature_extractor.predict(X, unflatten=True)

    def fit(self, X):
        """Fit the k-means vocabulary on all local features of ``X``."""
        assert X.ndim == 3
        # merge the first two axes: one row per local feature
        flat_features = X.reshape(X.shape[0] * X.shape[1], -1)
        self.kmeans = Kmeans(self.nclusters)
        self.kmeans.fit(flat_features)

    def predict(self, X):
        """Return, per item of ``X``, the histogram of predicted words.

        The result has shape ``(X.shape[0], self.nclusters)``.
        """
        assert X.ndim == 3
        n_items, n_local = X.shape[0], X.shape[1]
        flat_features = X.reshape(n_items * n_local, -1)
        words = self.kmeans.predict(flat_features)
        words = words.reshape(n_items, n_local)
        histograms = numpy.zeros((n_items, self.nclusters))

        # each row is a view, so the increments land in `histograms`
        for row, item_words in zip(histograms, words):
            for word in item_words:
                row[word] += 1

        return histograms
    def quantize(self, img):
        """
        Cluster the pixels of an image into 2^b colours.

        Parameters
        ----------
        img : a (H,W,3) numpy array

        Returns
        -------
        quantized_img : a (H,W,1) numpy array containing cluster indices

        Stores
        ------
        colours : a (2^b, 3) numpy array, each row is a colour

        """
        height, width, _ = img.shape
        # one row per pixel
        flat_pixels = img.reshape((-1, 3))

        kmeans = Kmeans(2**self.b)
        kmeans.fit(flat_pixels)

        labels = kmeans.predict(flat_pixels)
        quantized_img = labels.reshape((height, width, 1))

        self.colours = kmeans.means
        return quantized_img
Пример #6
0
def main():
    """Train the model and evaluate performance."""

    # Load the MNIST training data and flatten every 28x28 image into a
    # single vector.
    images, labels = amitgroup.io.mnist.load_mnist('training', path='./', asbytes=True)
    images = [image.ravel() for image in images]

    # Index of the first image found for each distinct label.
    indices = unique(labels, return_index=True)[1]

    # Create the clustering engine, seeded with those images as centers.
    clustering = Kmeans()
    clustering.train(data=images, centers=take(images, indices, axis=0), max_iterations=100)

    # Load the testing data set and flatten the images.
    test_images, test_labels = amitgroup.io.load_mnist('testing', path='./', asbytes=True)
    test_images = [image.ravel() for image in test_images]

    # Assign the test data to clusters and compare with the labels.
    predictions = [clustering.cluster(image) for image in test_images]
    success = (predictions == test_labels)
    outcomes, counts = unique(success, return_counts=True)

    # Look the False count up by value: when every prediction is right the
    # only entry of `counts` is the number of *correct* ones.
    tally = dict(zip(outcomes.tolist(), counts.tolist()))
    wrong = tally.get(False, 0)

    print('{} of the testing data set where put in the wrong cluster'.format(wrong))

    plot_images_separately([reshape(center, (28,28)) for center in clustering.centers])
Пример #7
0
def main():
    """Run k-means several times, plot the distortions, then start EM from
    the lowest-distortion run for both covariance settings."""
    # Reading the training and test data
    train_path = './data/EMGaussian.data'
    test_path = './data/EMGaussian.test'
    data = dp.parse_data_wo_labels(train_path, 2, delimiter=' ')
    data_test = dp.parse_data_wo_labels(test_path, 2, delimiter=' ')

    # K-means restarts: remember the run with the smallest distortion
    distortions = []
    best_kmean_model = None
    lowest_distortion = float("inf")
    for _ in xrange(NB_INITIALIZATION_RETRIES):
        candidate = Kmeans(data, NB_CLUSTERS, MAX_K_MEAN_ITER)
        candidate.run()
        distortions.append(candidate.distortion)

        if candidate.distortion < lowest_distortion:
            best_kmean_model = candidate
            lowest_distortion = candidate.distortion

    # Distortion of every restart
    attempt_numbers = range(1, NB_INITIALIZATION_RETRIES + 1)
    plt.plot(attempt_numbers, distortions)
    plt.xlabel("Initialization number")
    plt.ylabel("Distortion")
    plt.title("Running few Kmeans and measuring the distortion for each")
    plt.show()

    # Clustering of the retained run
    best_kmean_model.plot()

    # First with a covariance proportional to identity, then the general case
    for proportional in (True, False):
        run_em_model(data, data_test, best_kmean_model,
                     sigma_prop_identity=proportional)
Пример #8
0
def main():
    """Load the data and run the K search that helps pick a cluster count."""
    # 1. Load the data
    dataDF = getDF()

    # 2. Search for the best K: the first clear elbow in the curve marks it
    km = Kmeans()
    # Inspect the saved figure to choose the best K.
    # NOTE(review): 2 and 12 are presumably the bounds of the K range -- confirm.
    km.searchK(SAVAPATH,dataDF,2,12)
Пример #9
0
    def init_h(self):
        """Initialise ``H``, ``G`` and ``W`` where they are not set yet.

        ``H`` and ``G`` are seeded from one k-means clustering of
        ``self.data`` (10 iterations, ``self._num_bases`` clusters, seeded
        with ``self.seed``); ``W`` is then computed as ``data . G``.
        Attributes that already exist on the instance are left untouched.
        """
        need_h = not hasattr(self, 'H')
        need_g = not hasattr(self, 'G')

        if need_h or need_g:
            # The cluster assignment feeds both H and G, so it is computed
            # whenever either is missing; this keeps G buildable when the
            # caller supplied H up front.
            km = Kmeans(self.data[:, :],
                        num_bases=self._num_bases,
                        seed=self.seed)
            km.factorize(niter=10)
            assign = km.assigned

            # number of samples assigned to each cluster
            num_i = np.zeros(self._num_bases)
            for i in range(self._num_bases):
                num_i[i] = len(np.where(assign == i)[0])

        if need_h:
            # one-hot cluster membership plus a constant offset of 0.2
            self.H = np.zeros((self._num_bases, self._num_samples))
            self.H.T[range(len(assign)), assign] = 1.0
            self.H += 0.2 * np.ones((self._num_bases, self._num_samples))

        if need_g:
            # one-hot membership plus 0.01, each row divided by the size of
            # the cluster its sample was assigned to
            self.G = np.zeros((self._num_samples, self._num_bases))
            self.G[range(len(assign)), assign] = 1.0
            self.G += 0.01
            self.G /= np.tile(np.reshape(num_i[assign], (-1, 1)),
                              self.G.shape[1])

        if not hasattr(self, 'W'):
            self.W = np.dot(self.data[:, :], self.G)
Пример #10
0
 def kmeansselect(self):
     """Cluster ``self.data`` into ``self._nsub`` groups and return the
     result of ``dist.vq`` between the data and the cluster centres."""
     model = Kmeans(self.data, num_bases=self._nsub)
     model.initialization()
     model.factorize()

     # pick data samples closest to the centres
     return dist.vq(model.data, model.W)
 def quantize(self, img):
     """Fit k-means with 2**b clusters on the pixels of ``img``.

     Stores the image as ``self.img`` and the fitted model as
     ``self.model``; returns the model's means.
     """
     bits = self.b
     n_cols, n_rows, depth = img.shape
     self.img = img

     # one row per pixel
     pixels = np.reshape(img, (n_cols * n_rows, depth))

     fitted = Kmeans(k=2 ** bits)
     fitted.fit(pixels)
     self.model = fitted
     return fitted.means
Пример #12
0
class ClusteringTest(TestCase):
    # Exercises the Reader (courses, normalisation, vocabulary, vector-space
    # encoding) and the Kmeans clusterer (distance, mean, train/classify).
    # NOTE(review): `filename` is not defined in this class; presumably a
    # module-level path to the course list -- confirm.

    def setUp(self):
        # fresh reader and a 3-cluster k-means for every test
        self.reader = Reader(filename)
        self.clusterer = Kmeans(3)

    def test_01_courses(self):
        courses = self.reader.courses  # list of courses
        self.assertEqual(courses[:3],
                         ['Bioinformatik', 'Informatik', 'Mathematik'])

    def test_02_normalize(self):
        # the expected value shows the bracket stripped and the word lower-cased
        word = "(Studienrichtung"
        normalized_word = self.reader.normalize_word(
            word)  # normalized form of the word
        self.assertEqual(normalized_word, "studienrichtung")

    def test_03_vocabulary(self):
        words = self.reader.vocabulary
        self.assertEqual(words[:3], ['albanologie', 'allgemeine', 'als'])

    def test_04_vectorspaced(self):
        word_to_vectorspace = self.reader.vectorspaced("Slavische Philologie")
        vocab_size = len(self.reader.vocabulary)

        # one component per vocabulary entry
        self.assertEqual(vocab_size, len(word_to_vectorspace))

        # exactly two components are set, presumably one per word of the
        # title -- confirm against the vocabulary
        self.assertEqual(word_to_vectorspace, [
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0
        ])

    def test_05_distance(self):
        a = [1, 2, 3]
        b = [4, 5, 6]
        euclidean_dist = self.clusterer.distance(a, b)
        # sqrt(27) is about 5.196, truncated to 5
        self.assertEqual(int(euclidean_dist), 5)

    def test_06_vector_mean(self):
        vectors = [[1, 2, 3], [4, 5, 6]]
        mean = self.clusterer.vector_mean(vectors)
        self.assertEqual(mean, [2.5, 3.5, 4.5])

    def test_07_classify(self):
        vectorspaced_data = self.reader.vector_spaced_data

        # cluster labels differ between runs, so only the count is checked
        self.clusterer.train(vectorspaced_data)
        clusters = [self.clusterer.classify(vec) for vec in vectorspaced_data]
        self.assertEqual(len(clusters), len(vectorspaced_data))
Пример #13
0
    def create_splits(self, X):
        """Derive split thresholds from per-feature k-means means.

        A 3-cluster k-means model is fitted on every column of ``X``; the
        means of all features are gathered in one array and the sorted unique
        values are stored in ``self.thresholds``.
        """
        model = Kmeans(3)
        N, D = X.shape

        # model.k slots per feature
        splits = np.empty((D * model.k, ))

        for d in range(D):
            conSplit = X[:, d]
            # NOTE(review): transposing a 1-D column is a no-op; if
            # Kmeans.fit expects an (N, 1) matrix this needs a reshape --
            # confirm against the Kmeans implementation.
            conSplit = np.array(conSplit).transpose()
            model.fit(conSplit)
            for i in range(model.k):
                # feature d owns slots d*k .. d*k + k - 1, so no mean
                # overwrites another
                splits[d * model.k + i] = model.means[i, ]

        # thresholds come from the means collected above
        self.thresholds = np.unique(splits)
    def quantize(self, img):
        """
        Cluster the pixels of an image into 2^b colours.

        Parameters
        ----------
        img : a (H,W,3) numpy array

        Returns
        -------
        quantized_img : a (H,W) numpy array containing cluster indices

        Stores
        ------
        colours : a (2^b, 3) numpy array, each row is a colour

        """
        H, W, D = img.shape
        n_colours = 2**self.b

        model = Kmeans(k=n_colours)
        pixels = np.reshape(img, (H * W, 3))
        model.fit(pixels)
        labels = model.predict(pixels)
        print(labels.shape)

        # copy the cluster means row by row into a uint8 palette
        palette = np.zeros((n_colours, 3), dtype='uint8')
        self.colours = palette
        for row in range(n_colours):
            palette[row, :] = model.means[row, :]

        # store the labels as uint8, then fold them back to image layout
        flat_labels = np.zeros((H * W), dtype='uint8')
        for pos in range(H * W):
            flat_labels[pos] = labels[pos]

        return np.reshape(flat_labels, (H, W))
Пример #15
0
    def create_splits(self, X):
        """Store, per feature, the squeezed means of a 10-cluster k-means fit.

        The result is a list with one entry per column of ``X``, kept in
        ``self.thresholds``.
        """
        # k = 10 was obtained via the elbow method
        n_rows, n_cols = X.shape
        per_feature = []
        for col in range(n_cols):
            km = Kmeans(k=10)

            # all values of this feature, as an (N, 1) column
            km.fit(X[:, col].reshape(n_rows, 1))
            per_feature.append(np.squeeze(km.means))

        self.thresholds = per_feature
    def quantize_image(self, img):
        """Cluster the pixels of ``img`` into 2**b groups.

        Stores the fitted means in ``self.means``, prints the label vector
        and returns it.
        """
        original_shape = tuple(img.shape)
        w, h, d = original_shape

        # one row per pixel
        pixels = np.reshape(img, (w * h, d))

        kmeans = Kmeans(k=(2**self.b))
        kmeans.fit(pixels)
        assignments = kmeans.predict(pixels)
        self.means = kmeans.means

        print("Cluster Assignments")
        print(assignments)
        return assignments
Пример #17
0
def main():
    """Build and save a K-means model unless a stored one already exists."""

    KMEANS = leerModelo()
    # Identity tests: `== None` / `!= None` defer to the operand's own
    # comparison, which need not return a plain bool.
    if KMEANS is not None:
        print('Ya existe un modelo')
        return

    print('NO EXISTE UN MODELO')
    spotify = SpotifyPro()
    df = spotify.iniciar(idPlaylist='1QP6tyANnZZ9bRTfQG4X7a')
    k = Kmeans(df)  # holds the network and the datasets
    if len(df):
        k.importarDatos()
        # persist only when a network is actually set
        if k.red is not None:
            guardarModelo(k)
    def dequantize_image(self, img):
        """Rebuild ``img`` with every pixel replaced by its cluster mean.

        A 2**b-cluster k-means is fitted on the pixels; the means are kept in
        ``self.means`` and the reconstructed array, shaped like ``img``, is
        returned.
        """
        w, h, d = img.shape
        pixels = np.reshape(img, (w * h, d))
        restored = np.zeros(img.shape)

        kmeans = Kmeans(k=(2**self.b))
        kmeans.fit(pixels)

        labels = kmeans.predict(pixels)
        self.means = kmeans.means

        # ndindex walks (row, col) in the same row-major order as the labels
        for flat_pos, (row, col) in enumerate(np.ndindex(w, h)):
            restored[row][col] = self.means[labels[flat_pos]]
        return restored
Пример #19
0
def main():
    """Cluster two tab-separated datasets with k-means, print the external
    validation indices and save a plot for each dataset.

    In both files column 1 is passed to ``Kmeans`` as the second argument
    (read back later as ``ground_truth_clusters``) and columns 2 onward are
    the features.
    """
    dataset1 = np.genfromtxt(r'../data/new_dataset_1.txt',
                             dtype=float,
                             delimiter='\t')
    dataset2 = np.genfromtxt(r'../data/cho.txt', dtype=float, delimiter='\t')

    km1 = Kmeans(dataset1[:, 2:], dataset1[:, 1], 3)
    km2 = Kmeans(dataset2[:, 2:], dataset2[:, 1], 10)

    # NOTE(review): the return values are unused; the calls are presumably
    # made for their effect on the models -- confirm.
    ic1 = km1.initial_centroids(3, 5, 9)
    #ic1 = km1.initial_random_centroids(5)
    ic2 = km2.initial_random_centroids(5)
    # km1.centroids = km1.init_centroids = np.loadtxt(r'../log/cho_ground_centroids.txt')

    # specify iteration as parameter here
    km1.kmeans_algorithm()
    km2.kmeans_algorithm()

    extr_index_validation1 = ExternalIndex(km1.ground_truth_clusters,
                                           km1.clusters)
    extr_index_validation2 = ExternalIndex(km2.ground_truth_clusters,
                                           km2.clusters)

    print('Rand Index of dataset1 clusters :',
          extr_index_validation1.rand_index())
    print('Jaccard Coefficient of dataset1 clusters :',
          extr_index_validation1.jaccard_coefficient())

    print('Rand Index of dataset2 clusters :',
          extr_index_validation2.rand_index())
    print('Jaccard Coefficient of dataset2 dataset clusters :',
          extr_index_validation2.jaccard_coefficient())

    # Index the arrays themselves: ndarray.data is the raw buffer object and
    # does not support the [:, 2:] slicing used for the features above.
    plot1 = Visualization(dataset1[:, 2:], km1.clusters, dataset1[:, 1])
    plot2 = Visualization(dataset2[:, 2:], km2.clusters, dataset2[:, 1])
    plot1.plot(r'../log/td1.jpg')
    plot2.plot(r'../log/cho2.jpg')

    # gene_cluster_matched = km1.cluster_validation()
    # print('Genes that matched in clusters: ', gene_cluster_matched)

    return
Пример #20
0
        def closure_1_3_1():
            """Fit 4-cluster k-means 50 times on ``X`` and save a plot of the
            clustering produced by the lowest-error model."""
            n_clusters = 4
            lowest_error = np.inf
            best_model = None
            for _ in range(50):
                candidate = Kmeans(n_clusters)
                candidate.fit(X)
                candidate_error = candidate.error(X)
                if not candidate_error < lowest_error:
                    continue
                lowest_error = candidate_error
                best_model = candidate

            plt.figure()
            assignments = best_model.predict(X)
            utils.plot_2dclustering(X, assignments)

            fname = os.path.join("..", "figs",
                                 "kmeans_outliers_best_model.png")
            plt.savefig(fname)
            print("\nFigure saved as '%s'" % fname)
Пример #21
0
    def _init_parameters(self, X, method='kmeans'):
        """
        Initialise the parameters of the Gaussian components.

        If method == 'kmeans', the samples are grouped with k-means;
        if method == 'random', ``self.m`` samples are drawn as centres and
        every sample joins its nearest centre.
        Each component is then initialised from its group and weighted by the
        group's share of the samples.

        Raises ValueError for any other ``method``.
        """
        n = X.shape[0]
        self.Guass = [Guass_distribution(dim=self.dim) for i in range(self.m)]

        # Compare strings by value: `is` tests object identity, which only
        # matches when the caller's string happens to be the same object.
        if method == 'kmeans':
            try:
                kmeans = Kmeans()
                labels, centroids = kmeans.main(X,
                                                k=self.m,
                                                t=100,
                                                c_strategy='kmeans')
            except Exception:
                # Fall back to SciPy's k-means. Catching Exception rather
                # than everything lets KeyboardInterrupt/SystemExit through.
                centroids, labels = vq.kmeans2(X,
                                               self.m,
                                               minit='points',
                                               iter=1000)
            clusters = [[j for j in range(n) if labels[j] == i]
                        for i in range(self.m)]

        elif method == 'random':
            time_seed = int(time.time())
            np.random.seed(time_seed)
            clusters = [[] for i in range(self.m)]
            # Draw m sample indices as centres.
            # NOTE(review): the seed above is set on np.random, while the
            # draw uses the stdlib `random` module -- confirm intent.
            centroids = random.sample(list(range(n)), self.m)

            for i in range(n):
                # position of the nearest centre in `centroids`
                ci = np.argmin([la.norm(X[i] - X[c]) for c in centroids])
                clusters[ci].append(i)

        else:
            raise ValueError("Unknown method type!")

        for i in range(self.m):
            guass = self.Guass[i]
            data = X[clusters[i]]
            guass.init(data)
            guass.weight = len(clusters[i]) / n
Пример #22
0
def sliding_window_three_months(df, date):
    """Cluster the three-month candlestick data for ``date`` and wrap the
    four per-cluster columns in a ``candlestickState``."""
    # parsed as in the original flow; the resulting object is not used
    starting_date_obj = datetime.datetime.strptime(date, "%Y-%m-%d")

    monthly_data = preprocess().get_three_monthly_candlestick_data(df, date)
    e = Kmeans(monthly_data).get_clusters()
    print('original', e)

    # split the cluster rows into their first four columns
    ctut, ctlt, ctbl, ctc = [], [], [], []
    for i in range(0, len(e)):
        for column, target in enumerate((ctut, ctlt, ctbl, ctc)):
            target.append(e[i][column])

    return candlestickState(ctut, ctlt, ctbl, ctc)
Пример #23
0
    def _init_kmeans(self):
        """
        Seed the mixture from a k-means run on ``self.data``.

        The k-means itself starts from a random initialisation, which is a
        poor choice. Returns the transposed ``cluster`` attribute and the
        ``label`` attribute of the fitted model.
        """
        # Estimate the means of the mixture components, using k-means
        fitted = Kmeans(self.data, self.k)
        return fitted.cluster.T, fitted.label
Пример #24
0
    def initialization(self):
        """Initialise ``W``, ``H`` and ``G`` from a k-means clustering of
        ``self.data``."""
        n_bases = self._num_bases
        n_samples = self._num_samples

        # start from all-zero matrices
        self.W = np.zeros((self._data_dimension, n_bases))
        self.H = np.zeros((n_bases, n_samples))
        self.G = np.zeros((n_samples, n_bases))

        # cluster the data to obtain one assignment per sample
        km = Kmeans(self.data[:, :],
                    num_bases=self._num_bases,
                    show_progress=self._show_progress)
        km.initialization()
        km.factorize()
        assign = km.assigned
        sample_rows = range(len(assign))

        # number of samples per cluster
        num_i = np.array(
            [len(np.where(assign == base)[0]) for base in range(n_bases)],
            dtype=float)

        # G: one-hot membership plus 0.01, each row divided by the size of
        # the cluster its sample belongs to
        self.G[sample_rows, assign] = 1.0
        self.G += 0.01
        row_sizes = np.reshape(num_i[assign], (-1, 1))
        self.G /= np.tile(row_sizes, self.G.shape[1])

        # H: one-hot membership plus a constant 0.2
        self.H.T[sample_rows, assign] = 1.0
        self.H += 0.2 * np.ones((n_bases, n_samples))

        self.W = np.dot(self.data[:, :], self.G)
Пример #25
0
 def select_next(iterval):
     """Choose the next data sample, either through the robust map or as
     the plain maximum of ``iterval``."""
     if self._robust_map:
         # indices by decreasing iterval; the head of the list is clustered
         order = np.argsort(iterval)[::-1]
         head = order[:self._robust_nselect]
         d_sub = self.data[:, head]
         self.sub.extend(head)

         # cluster d_sub
         kmeans_mdl = Kmeans(d_sub, num_bases=self._robust_cluster)
         kmeans_mdl.factorize(niter=10)

         # member count per cluster; the fullest cluster provides the centre
         counts = np.histogram(kmeans_mdl.assigned,
                               range(self._robust_cluster + 1))[0]
         largest_cluster = np.argmax(counts)
         centre = kmeans_mdl.W[:, largest_cluster:largest_cluster + 1]
         choice = order[np.argmin(pdist(centre, d_sub))]
     else:
         choice = np.argmax(iterval)

     return choice
Пример #26
0
def kmeans(imagens, segmentadas,   path):
    # Segments an image with k = 2, 3 and 5, writes the three results to
    # res2.png / res3.png / res9.png and lays them out next to the original
    # in a four-panel figure.
    # NOTE(review): `segmentadas` and `path` are not used in the visible
    # body, and `qtd` is never incremented here, so every iteration reads
    # imagens[0]. The block may be truncated -- confirm against the full file.

    k = Kmeans(imagens)
    qtd  = 0
    
    for i in imagens: 
        # Read the image
        img = imread(imagens[qtd][0]) 
        #img = cv2.resize(img, (segmentadas[qtd][2], segmentadas[qtd][1]))   

        # segmentation results divided by 255; res9 holds the k=5 result
        res2 = k.kmeans_seg(img, 2) / 255
        res3 = k.kmeans_seg(img, 3) / 255
        res9 = k.kmeans_seg(img, 5) / 255
        cv2.imwrite("res2.png", res2)
        cv2.imwrite("res3.png", res3)
        cv2.imwrite("res9.png", res9)

        # one row of four panels: original, k=2, k=3, k=5
        fig = plt.figure(figsize=(9,3), dpi=200)
        k.add_image(fig, img, 1, 4, 1, 'original')
        k.add_image(fig, res2, 1, 4, 2, 'k=2')
        k.add_image(fig, res3, 1, 4, 3, 'k=3')
        k.add_image(fig, res9, 1, 4, 4, 'k=5')
Пример #27
0
def main(_argv):
    # Loads the samples from 'spirala.txt', clusters them with Kmeans and
    # shows an animation with one frame per entry of `progress`.
    probki_string, nazwy_atr, czy_atr_symb = wczytaj_baze_probek_z_tekstem(
        'spirala.txt', 'spirala-type.txt')
    # convert the textual samples to numbers
    # NOTE(review): (0, 1) presumably selects the columns to convert -- confirm.
    probki = probki_str_na_liczby(probki_string, (0, 1))
    # NOTE(review): `progress` is not defined in this function; presumably a
    # module-level container that Kmeans fills on each iteration -- confirm.
    grupy, osrodki = Kmeans(probki, FLAGS.groups, FLAGS.iterations, progress)

    fig = plt.figure(1)
    # the animation object is kept in a local for as long as main runs
    anim = animation.FuncAnimation(fig,
                                   Animate,
                                   frames=len(progress),
                                   repeat=False,
                                   interval=500)
    chart.show()
Пример #28
0
    def kmeansselect(self):
        """Run k-means with ``self._nsub`` bases on ``self.data`` and return
        what ``dist.vq`` yields for the data against the cluster centres."""
        clusterer = Kmeans(self.data, num_bases=self._nsub)
        clusterer.initialization()
        clusterer.factorize()

        # pick data samples closest to the centres
        return dist.vq(clusterer.data, clusterer.W)
    def quantize(self, img):
        """
        Quantizes an image into 2^b clusters

        Parameters
        ----------
        img : a (H,W,3) numpy array

        Returns
        -------
        quantized_img : a (H,W) numpy array containing cluster indices

        Stores
        ------
        colours : a (2^b, 3) numpy array, each row is a colour

        NOTE(review): the visible body neither returns ``quantized_img`` nor
        stores ``colours``; it only prints the labels and the value returned
        by ``fit``. The block may be truncated -- confirm against the full
        file.
        """

        H, W, _ = img.shape
        # flat list of pixels, gathered row by row
        z = []

        for h in range(img.shape[0]):
            for w in range(img.shape[1]):
                z.append(img[h][w])

        #model = KMeans(n_clusters=2**self.b, n_init=3)
        model = Kmeans(k = 2**self.b)
        # `m` is whatever fit() returns; it is printed below as 'means'
        m = model.fit(np.array(z))
        y = model.predict(np.array(z))
        print('y =', y)
        print('means =', m)
Пример #30
0
    def quantize(self, X):
        """Cluster the entries of a 3-D array into 2**b groups.

        ``X`` is flattened over its first two axes and fitted with k-means.
        The result is stored on the instance: ``self.means`` holds the
        cluster means, ``self.y`` the labels reshaped to the first two axes
        of ``X``, and ``self.X`` the input itself. Nothing is returned.
        """
        N, D, C = X.shape

        # one row per entry of the first two axes
        X_reshaped = np.reshape(X, (N * D, C))
        print(X_reshaped)

        model = Kmeans(np.power(2, self.b))
        model.fit(X_reshaped)
        # predict once; a second call whose result was thrown away only
        # repeated the work
        y = np.reshape(model.predict(X_reshaped), (N, D))
        self.means = model.means
        self.y = y
        self.X = X
Пример #31
0
        def closure_1_3_2():
            """Plot the lowest k-means error over 50 fits for k = 1..10 and
            save the figure."""
            k_values = list(range(1, 11))
            minErrs = []
            for k in k_values:
                lowest = np.inf
                for _ in range(50):
                    candidate = Kmeans(k)
                    candidate.fit(X)
                    candidate_error = candidate.error(X)
                    if candidate_error < lowest:
                        lowest = candidate_error

                minErrs.append(lowest)

            plt.figure()
            plt.plot(k_values, minErrs)
            plt.xlabel('k')
            plt.ylabel('Error')
            plt.title('k-means training error as k increases')

            fname = os.path.join("..", "figs", "kmeans_err_k_outliers.png")
            plt.savefig(fname)
            print("\nFigure saved as '%s'" % fname)
Пример #32
0
def main():
    """Cluster a board generated by ``tc.init_board_gauss``, save the model
    to ``km_path`` and print it.

    All settings (point/class counts, bounds, cluster count, cpu, distance
    method, image directory) are read from names defined outside this
    function.
    """
    km = Kmeans(tc.init_board_gauss(nb_points, nb_classe, mini, maxi,
                                    ecart_min, ecart_max),
                nb_cluster=nb_cluster,
                cpu=cpu,
                methode_dist=methode_dist,
                adr=img_dir)
    km.run_global(choose_nb_graph=True, grphq=True)
    km.save(km_path)
    print("\n{}".format(km))
    return None
Пример #33
0
def main():
    """Load a data frame from ``path``, cluster its matrix form, save the
    model to ``km_path`` and print it."""
    
    
    df = bdd.date_dir(path)
    idx, mtx = bdd.df2np(df)
    # the frame is not needed once split into index and matrix
    del df
    
    km = Kmeans(mtx, nb_cluster=nb_cluster, cpu=cpu, methode_dist=methode_dist, adr=img_dir, index=idx)
    km.run_global(grphq=True, choose_nb_graph=True)
    
    km.save(km_path)
    print("\n{}".format(km))
    return None
Пример #34
0
def main():
    """Build a matrix from the files under ``path``, cluster it, print the
    per-cluster words, then save and print the model."""
    df = bdd.concat_dir(path)
    df = bdd.drop_profile(df, drop_var)
    df = bdd.bdd2bow(df)
    idx, mtx = bdd.df2np(df)
    # keep the column names as strings before the frame is dropped
    col = df.columns.values.astype(str)
    del df
    
    km = Kmeans(mtx, nb_cluster=nb_cluster, cpu=cpu, methode_dist=methode_dist, adr=img_dir, index=idx)
    km.run_global(choose_nb_graph=True)
    bdd.print_means_words(km, col)
    km.save(km_path)
    print("\n{}".format(km))
    return None
Пример #35
0
from kmeans import Kmeans 
import numpy as np
# Demo of the Kmeans class on nine two-dimensional points, stored one point
# per column. Uses Python 2 print statements.
data = np.array([[1.9,1.5,0.4,0.4,0.1,0.2,2.0,0.3,0.1],[2.3,2.5,0.2,1.8,0.1,1.8,2.5,1.5,0.3]])
# An integer here asks for that many clusters.
# NOTE(review): presumably initialised at random, as the message below says.
codes = 3
km = Kmeans(data,codes)
print 'Class labels = ', km.label
print('Due to the random initialization, different (wrong) labels\nare often returned')    
# classify one new point
x = np.array([0.25,2.0])
print km.classify(x)
#km.label = lab2
km.plot()
print('Verify the answer using the graph.')
# Specify the initial cluster means: data points 0, 2 and 3, one per column.
codes =  np.array([data[:,0],data[:,2],data[:,3]]).T 
km = Kmeans(data,codes)  
print 'Clusters = ',km.cluster
print 'Class labels = ', km.label

Пример #36
0
            # Fragment of a larger routine (Python 2): normalises every
            # document, then runs k-means 13 times per configuration and
            # reports the mean and median of the summed errors.
            print 'Normalizing frequencies...',
            stdout.flush()
            # Don't modify the original set
            # NOTE(review): normalize() is called on each doc of
            # parser.docset directly; whether it works in place is not
            # visible here -- confirm against the comment above.
            for i, doc in enumerate(parser.docset):
                normalize(doc, parser.words, idf)
                print i
            gc.collect()
            print 'done'

            for chooser in [choose_initial]: #choose_initial_pp, choose_initial:
                for k in [10]:
                    errors = []
                    print '\nStemming words: %s' % stem
                    print 'Using IDF: %s' % idf
                    print 'Running with %d centroids' % k
                    if chooser is choose_initial:
                        print 'Chooser: normal'
                    else:
                        print 'Chooser: plusplus'
                    stdout.flush()
                    # 13 independent runs for this (chooser, k) pair
                    for _ in xrange(13):
                        kmeans = Kmeans(parser.docset, k, distance,
                                        calc_centroid, chooser, tol=0.001)
                        clusters = get_clusters(kmeans.result(), parser.docset)
                        freqs = get_docs_frequencies(clusters)
                        errors.append(sum(calc_error(freqs)))
                    # %d truncates the mean and median to integers
                    print 'Error mean: %d and median: %d' % \
                        (numpy.mean(errors), numpy.median(errors))
            gc.collect()