def test_random_guesses(self):
    data = g_mnemstudio_example_data
    nclusters = 3
    #XXX copypasted
    # write an hdf file and create an associated data set
    name_hdf = get_tmp_filename()
    f_hdf = h5py.File(name_hdf, 'w')
    dset = f_hdf.create_dataset('testset', data=data)
    f_hdf.close()
    f_hdf = h5py.File(name_hdf, 'r')
    dset = f_hdf['testset']
    #XXX copypasted
    # write a tabular text file and re-open the file
    name_stream = get_tmp_filename()
    np.savetxt(name_stream, data)
    f_stream = open(name_stream)
    # check that the simplest invocations run without error
    bigkmeans.kmeans(data, nclusters=nclusters)
    bigkmeans.kmeans_hdf(dset, nclusters=nclusters)
    bigkmeans.kmeans_ooc(f_stream, nclusters=nclusters)
    #XXX copypasted
    # close the hdf file and the tabular data file
    f_hdf.close()
    f_stream.close()
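# The tests in this file rely on a get_tmp_filename helper that is not
# shown in this excerpt.  A minimal sketch of what it could look like,
# assuming it only needs to return the path of a fresh temporary file:
import tempfile

def get_tmp_filename():
    # create a named temporary file and return its path;
    # delete=False so that h5py and np.savetxt can reopen it by name
    with tempfile.NamedTemporaryFile(delete=False, suffix='.tmp') as f:
        return f.name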
def test_cluster_loss_errors(self):
    # these are data and a guess for which a cluster is eventually lost
    data = g_pathological_data
    guess = g_pathological_guess
    #XXX copypasted
    # write an hdf file and create an associated data set
    name_hdf = get_tmp_filename()
    f_hdf = h5py.File(name_hdf, 'w')
    dset = f_hdf.create_dataset('testset', data=data)
    f_hdf.close()
    f_hdf = h5py.File(name_hdf, 'r')
    dset = f_hdf['testset']
    #XXX copypasted
    # write a tabular text file and re-open the file
    name_stream = get_tmp_filename()
    np.savetxt(name_stream, data)
    f_stream = open(name_stream)
    # Each of the following three calls should raise a cluster loss error.
    # numpy
    testing.assert_raises(
            bigkmeans.ClusterLossError,
            bigkmeans.kmeans,
            data,
            centroids=guess,
            on_cluster_loss=bigkmeans.error_on_cluster_loss,
            )
    # hdf
    testing.assert_raises(
            bigkmeans.ClusterLossError,
            bigkmeans.kmeans_hdf,
            dset,
            centroids=guess,
            on_cluster_loss=bigkmeans.error_on_cluster_loss,
            )
    # out-of-core stream
    testing.assert_raises(
            bigkmeans.ClusterLossError,
            bigkmeans.kmeans_ooc,
            f_stream,
            centroids=guess,
            on_cluster_loss=bigkmeans.error_on_cluster_loss,
            )
    # Check that no errors are raised through these calls,
    # although a large number of restarts may occur.
    benign_action_args = (
            None,
            bigkmeans.ignore_cluster_loss,
            bigkmeans.return_on_cluster_loss,
            bigkmeans.retry_after_cluster_loss,
            )
    for fn in benign_action_args:
        bigkmeans.kmeans(data, centroids=guess, on_cluster_loss=fn)
        bigkmeans.kmeans_hdf(dset, centroids=guess, on_cluster_loss=fn)
        bigkmeans.kmeans_ooc(f_stream, centroids=guess, on_cluster_loss=fn)
    # close the hdf file and the tabular data file
    f_hdf.close()
    f_stream.close()
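# For context, g_pathological_data and g_pathological_guess are fixtures
# defined elsewhere in the module.  The values below are a purely
# hypothetical illustration of how such a fixture can force cluster loss
# under Lloyd's algorithm; they are not the module's actual data.
hypothetical_pathological_data = np.array([
        [0.0, 0.0],
        [0.0, 1.0],
        [9.0, 8.0],
        [9.0, 9.0],
        ], dtype=float)
hypothetical_pathological_guess = np.array([
        [0.0, 0.5],
        [9.0, 8.5],
        # this centroid is far from every point, so the first assignment
        # step gives it no points and its cluster is lost
        [100.0, 100.0],
        ], dtype=float)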
def helper(
        self, data,
        maxiters=None, guess=None, nclusters=None, on_cluster_loss=None,
        ):
    # if no guess has been provided then make one by sampling the data
    if guess is None:
        M, N = data.shape
        indices = sorted(random.sample(range(M), nclusters))
        guess = data[indices, :]
    # write an hdf file and create an associated data set
    name_hdf = get_tmp_filename()
    f_hdf = h5py.File(name_hdf, 'w')
    dset = f_hdf.create_dataset('testset', data=data)
    f_hdf.close()
    f_hdf = h5py.File(name_hdf, 'r')
    dset = f_hdf['testset']
    # write a tabular text file and re-open the file
    name_stream = get_tmp_filename()
    np.savetxt(name_stream, data)
    f_stream = open(name_stream)
    # check results for various vector quantization inner loops
    for fn_block_update in (
            bigkmeans.lloyd.update_block_pyvqcore,
            bigkmeans.lloyd.update_block_scipy,
            bigkmeans.lloyd.update_block_python,
            ):
        # get the scipy kmeans results
        vq_final_clust, vq_labels = scipy_cluster.vq.kmeans2(
                data, guess, maxiters)
        # get the bigkmeans numpy results
        results = bigkmeans.kmeans(
                data,
                centroids=guess,
                maxiters=maxiters,
                on_cluster_loss=on_cluster_loss,
                fn_block_update=fn_block_update,
                )
        np_init_clust, np_final_clust, np_labels = results
        # get the bigkmeans hdf results
        results = bigkmeans.kmeans_hdf(
                dset,
                centroids=guess,
                maxiters=maxiters,
                on_cluster_loss=on_cluster_loss,
                fn_block_update=fn_block_update,
                )
        hdf_init_clust, hdf_final_clust, hdf_labels = results
        # get the bigkmeans tabular text-based out-of-core results
        results = bigkmeans.kmeans_ooc(
                f_stream,
                centroids=guess,
                maxiters=maxiters,
                on_cluster_loss=on_cluster_loss,
                fn_block_update=fn_block_update,
                )
        ooc_init_clust, ooc_final_clust, ooc_labels = results
        # check that the outputs are the same for all methods
        for labels, final_clust in (
                (np_labels, np_final_clust),
                (hdf_labels, hdf_final_clust),
                (ooc_labels, ooc_final_clust),
                ):
            testing.assert_allclose(vq_final_clust, final_clust)
            testing.assert_allclose(vq_labels, labels)
    # close the hdf file and the tabular data file
    f_hdf.close()
    f_stream.close()
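# An illustrative sketch of how a test might drive the helper above;
# the fixture and parameter values are hypothetical, chosen only to
# match the style of the other tests in this excerpt.
def test_agreement_with_scipy(self):
    # all three backends should agree with the scipy.cluster.vq reference
    self.helper(g_mnemstudio_example_data, maxiters=10, nclusters=3)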
def main(args):
    # Optionally read the initial centroids.
    guess = None
    if args.initial_centroids:
        guess = np.loadtxt(args.initial_centroids, dtype=float, ndmin=2)
    # Optionally pick an inner loop implementation.
    fn_block_update = None
    if args.inner_loop:
        inner_loop_dict = {
                'pyvqcore' : bigkmeans.lloyd.update_block_pyvqcore,
                'scipy' : bigkmeans.lloyd.update_block_scipy,
                'python' : bigkmeans.lloyd.update_block_python,
                }
        fn_block_update = inner_loop_dict[args.inner_loop]
    # Open the data file and do the kmeans clustering.
    # Note that we deliberately disallow stdin
    # because we require a stream that can be restarted,
    # so that we can do one pass through the open file per iteration.
    if args.tabular_data_file:
        with open(args.tabular_data_file) as data_stream:
            guess, centroids, labels = bigkmeans.kmeans_ooc(
                    data_stream,
                    centroids=guess,
                    nclusters=args.nclusters,
                    on_cluster_loss=args.on_cluster_loss,
                    maxiters=args.maxiters,
                    maxrestarts=args.maxrestarts,
                    fn_block_update=fn_block_update,
                    verbose=args.verbose,
                    )
    elif args.hdf_data_file:
        if not h5py:
            raise ImportError(
                    'sorry, hdf5 data files cannot be used '
                    'unless the python package h5py is installed')
        if not args.hdf_dataset_name:
            raise Exception(
                    'If the data is in hdf format '
                    'then an hdf dataset name (--hdf-dataset-name) '
                    'must be specified '
                    'in addition to the name of the hdf file. '
                    'If you do not know the dataset name, '
                    'you can try the hdfview program '
                    'to search for your dataset within your hdf file.')
        f = h5py.File(args.hdf_data_file, 'r')
        dset = f[args.hdf_dataset_name]
        guess, centroids, labels = bigkmeans.kmeans_hdf(
                dset,
                centroids=guess,
                nclusters=args.nclusters,
                on_cluster_loss=args.on_cluster_loss,
                maxiters=args.maxiters,
                maxrestarts=args.maxrestarts,
                fn_block_update=fn_block_update,
                verbose=args.verbose,
                )
        f.close()
    # write the labels to stdout or to a user-specified file
    if args.labels_out == '-':
        np.savetxt(sys.stdout, labels, '%d')
    elif args.labels_out:
        np.savetxt(args.labels_out, labels, '%d')
    # optionally write the centroids to stdout or to a file
    if args.centroids_out == '-':
        np.savetxt(sys.stdout, centroids)
    elif args.centroids_out:
        np.savetxt(args.centroids_out, centroids)
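# main() expects an args namespace carrying the attributes referenced
# above.  A sketch of the corresponding argparse wiring; the help strings
# and the mapping from --on-cluster-loss choices onto bigkmeans callbacks
# are assumptions, not necessarily the project's actual CLI.
import argparse

def hypothetical_cluster_loss_action(name):
    # map a command line choice onto a bigkmeans callback (assumed mapping)
    actions = {
            'error' : bigkmeans.error_on_cluster_loss,
            'ignore' : bigkmeans.ignore_cluster_loss,
            'return' : bigkmeans.return_on_cluster_loss,
            'retry' : bigkmeans.retry_after_cluster_loss,
            }
    try:
        return actions[name]
    except KeyError:
        raise argparse.ArgumentTypeError('unknown action: ' + name)

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='out-of-core lloyd kmeans')
    parser.add_argument('--nclusters', type=int,
            help='number of clusters (if initial centroids are not given)')
    parser.add_argument('--initial-centroids',
            help='text file with one initial centroid per row')
    parser.add_argument('--tabular-data-file',
            help='data in a tabular text file (stdin is not allowed)')
    parser.add_argument('--hdf-data-file',
            help='data in an hdf file')
    parser.add_argument('--hdf-dataset-name',
            help='name of the dataset within the hdf file')
    parser.add_argument('--inner-loop',
            choices=('pyvqcore', 'scipy', 'python'),
            help='inner loop implementation')
    parser.add_argument('--on-cluster-loss',
            type=hypothetical_cluster_loss_action,
            help='what to do when a cluster loses all of its points')
    parser.add_argument('--maxiters', type=int,
            help='maximum number of lloyd iterations')
    parser.add_argument('--maxrestarts', type=int,
            help='maximum number of restarts after cluster loss')
    parser.add_argument('--labels-out',
            help="file for the labels, or '-' for stdout")
    parser.add_argument('--centroids-out',
            help="file for the final centroids, or '-' for stdout")
    parser.add_argument('--verbose', action='store_true',
            help='print progress information')
    main(parser.parse_args())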