def test_confidence_interval(self):
    '''
    Checks `half_confidence_interval_size` on flat samples.

    Two samples (8 and 32 observations) are evaluated at confidence
    0.95 and 0.99 respectively, and each result is compared to a
    hard-coded reference value up to 5 decimal places.
    '''
    small_sample = [8.0, 7.0, 5.0, 9.0, 9.5, 11.3, 5.2, 8.5]
    half_size_95 = half_confidence_interval_size(small_sample, 0.95)
    self.assertAlmostEqual(1.6772263663789651, half_size_95, places=5)

    large_sample = [
        8.0, 7.0, 5.0, 9.0, 9.5, 11.3, 5.2, 8.5,
        4.0, 7.4, 4.4, 9.0, 1.1, 0.0, 0.2, 9.5,
        1.0, 2.0, 3.0, 4.0, 5.5, 8.2, 4.2, 4.5,
        7.2, 7.0, 1.2, 5.3, 8.5, 1.3, 5.3, 9.5,
    ]
    half_size_99 = half_confidence_interval_size(large_sample, 0.99)
    self.assertAlmostEqual(1.4173919794304153, half_size_99, places=5)
def test_confidence_interval_axis(self):
    '''
    Checks the `axis` argument of `half_confidence_interval_size`.

    The input is a 2 x 8 nested list whose two rows are identical.
    The expected values below are hard-coded references:
    one value per row for `axis=1`, eight zeros for `axis=0` and a
    single scalar when no axis is given.
    '''
    row = [8.0, 7.0, 5.0, 9.0, 9.5, 11.3, 5.2, 8.5]
    data = [list(row), list(row)]

    #One interval per row
    per_row = half_confidence_interval_size(data, .95, axis=1)
    assert_array_almost_equal([1.67722628, 1.67722628], per_row)

    #One interval per column; both rows are equal
    per_column = half_confidence_interval_size(data, .95, axis=0)
    assert_array_almost_equal([0.] * 8, per_column)

    #No axis given
    overall = half_confidence_interval_size(data, .95)
    self.assertAlmostEqual(1.06902922476, overall)
def kmeans_betacv(data, num_cluster, batch_kmeans=False, n_runs=10,
                  confidence=0.90):
    '''
    Computes the BetaCV for running K-Means on the dataset.

    K-Means is fitted `n_runs` times. For each run the BetaCV is the mean
    distance from every observation to its assigned center (intra)
    divided by the mean of the pairwise euclidean distance matrix between
    centers (inter). This method returns the average BetaCV over the runs
    and half of the size of the confidence interval for that average.

    Arguments
    ---------
    data: matrix
        A matrix of observations. If this is sparse, `batch_kmeans` must
        be True
    num_cluster: int
        number of clusters to run k-means for
    batch_kmeans: bool (defaults to False)
        if `sklearn.cluster.MiniBatchKMeans` should be used. This is
        faster and suitable for sparse datasets, but less accurate.
    n_runs: int (default = 10)
        Number of runs to compute the BetaCV
    confidence: double [0, 1) (default = 0.9)
        The confidence used to compute half the confidence interval size

    Returns
    -------
    The betacv and half of the confidence interval size
    '''
    if batch_kmeans:
        algorithm = MiniBatchKMeans(num_cluster)
    else:
        algorithm = KMeans(num_cluster)

    inter_array = np.zeros(n_runs)
    intra_array = np.zeros(n_runs)
    #`range` works on both Python 2 and 3 (`xrange` is Python 2 only)
    for i in range(n_runs):
        #Run K-Means
        algorithm.fit(data)
        centers = algorithm.cluster_centers_
        labels = algorithm.labels_

        #Inter distance. KMeans in sklearn uses euclidean. The mean is
        #taken over the full matrix, zero diagonal included.
        dist_centers = pairwise.euclidean_distances(centers)
        inter_array[i] = np.mean(dist_centers)

        #Intra distance. Row `j` of the transform holds the distances of
        #observation `j` to every center; pick the assigned one per row.
        dist_all_centers = algorithm.transform(data)
        own_center_dists = dist_all_centers[np.arange(len(labels)), labels]
        intra_array[i] = np.mean(own_center_dists)

    betacv = intra_array / inter_array
    cinterval = half_confidence_interval_size(betacv, confidence)
    return np.mean(betacv), cinterval
def main(tcu_fpath):
    '''
    Clusters the item descriptions of a TCU csv file and plots, for each
    number of clusters, the inter/intra cluster distance ratio.

    Only rows whose 'Situacao' equals 'Aceito e Habilitado' are used. Each
    row becomes one text document (description, complementary description,
    supply unit and quantity joined by spaces) which is then vectorized.
    MiniBatchKMeans with random init is run 20 times for every k in
    [2, 15]; the plot shows the mean ratio per k with half of the 90%
    confidence interval as error bars.

    Arguments
    ---------
    tcu_fpath: path of the csv file, handed to
        `tcu_io.load_untreated_csv_to_numpy`
    '''
    data = tcu_io.load_untreated_csv_to_numpy(tcu_fpath)
    data = data[data['Situacao'] == 'Aceito e Habilitado']

    desc_column = data['Descricao']
    des_cmp_column = data['DescricaoComplementar']
    unidade_column = data['UnidadeFornecimento']
    qtd_column = [str(qtd) for qtd in data['Quantidade']]

    #Transforms descriptions to base strings
    as_docs = [" ".join(as_text) for as_text in
               zip(desc_column, des_cmp_column, unidade_column, qtd_column)]

    #Vectorizes to TF-IDF
    #NOTE(review): `Vectorizer` is defined outside this block; TF-IDF is
    #what the original comment states - confirm
    vectorizer = Vectorizer()
    doc_sparse_matrix = vectorizer.fit_transform(as_docs)

    #Compute clusters
    n_runs = 20
    min_k = 2
    max_k = 15
    k_vals = range(min_k, max_k + 1)

    #Each K has n_runs clusterings. Arrays are allocated once up front
    #instead of building a throwaway array on every iteration.
    inter = {k: np.zeros(n_runs) for k in k_vals}
    intra = {k: np.zeros(n_runs) for k in k_vals}

    #`range` works on both Python 2 and 3 (`xrange` is Python 2 only)
    for i in range(n_runs):
        for k in k_vals:
            #Run K-Means
            mbkm = MiniBatchKMeans(k, init = 'random')
            mbkm.fit(doc_sparse_matrix)

            centers = mbkm.cluster_centers_
            labels = mbkm.labels_

            #Inter distance. We use min because the idea is to maximize
            #this. Min serves as a penalty for the worst case. Zeros
            #(the diagonal) are filtered out before taking the min.
            dist_centers = pairwise.euclidean_distances(centers)
            inter[k][i] = np.min(dist_centers[dist_centers > 0])

            #Intra distance. Row `j` of the transform holds the distances
            #of document `j` to every center; pick the assigned one.
            dist_all_centers = mbkm.transform(doc_sparse_matrix)
            own_center_dists = \
                    dist_all_centers[np.arange(len(labels)), labels]
            intra[k][i] = np.mean(own_center_dists)

            #Prints num elements per cluster
            print('Run %d ; k = %d' %(i, k))
            counter = Counter(labels)
            for cluster, population in counter.items():
                print('\tK = %d; Pop = %d' %(cluster, population))
            print()

    #Sorted list of keys: `dict.keys()` is a view on Python 3 and cannot
    #be concatenated to a list.
    x = sorted(inter)
    y = []
    c = []
    for k in x:
        div = inter[k] / intra[k]
        y.append(np.mean(div))
        c.append(half_confidence_interval_size(div, 0.90))

    #hack for the zero to appear
    x = [0] + x
    y = [0] + y
    c = [0] + c

    ax = plt.gca()
    ax.set_yscale('log')
    ax.set_xticks(list(range(0, max_k + 1)))
    plt.ylabel('InterCluster/IntraCluster Ratio')
    plt.xlabel('Number of clusters')
    plt.errorbar(x, y, yerr=c, fmt='bo', markersize=8, elinewidth=2)
    plt.show()