def test_kmeans_breastcancer(self):
    """Check discomll k-means centroids against scikit-learn on breast cancer data.

    Both implementations are fitted with the same cluster count, iteration
    cap and random seed; the test passes when the centroid sets agree
    within ``np.allclose`` tolerance. Predictions are computed with both
    libraries as well, but only the centroids are asserted.
    """
    # python -m unittest tests_clustering.Tests_Clustering.test_kmeans_breastcancer
    from discomll.clustering import kmeans
    from sklearn.cluster import KMeans

    n_clusters, iterations, seed = 2, 10, 2

    sk_train, _, sk_test, _ = datasets.breastcancer_disc()
    disco_train, disco_test = datasets.breastcancer_disc_discomll()

    # Reference model: scikit-learn.
    reference = KMeans(
        n_clusters=n_clusters,
        max_iter=iterations,
        n_init=1,
        random_state=seed,
    )
    reference = reference.fit(sk_train)
    expected_centroids = reference.cluster_centers_
    expected_labels = reference.predict(sk_test)

    # Model under test: discomll.
    fitmodel = kmeans.fit(
        disco_train,
        n_clusters=n_clusters,
        max_iterations=iterations,
        random_state=seed,
    )
    prediction_results = kmeans.predict(disco_test, fitmodel)

    actual_labels = []
    for _, value in result_iterator(prediction_results):
        actual_labels.append(value[0])

    actual_centroids = []
    for _, value in result_iterator(fitmodel["kmeans_fitmodel"]):
        actual_centroids.append(value["x"])

    # Swap the two discomll centroids before comparing; presumably the
    # cluster order differs from scikit-learn's for this seed.
    head, tail = actual_centroids[1], actual_centroids[0]
    actual_centroids[0], actual_centroids[1] = head, tail

    centroids_match = np.allclose(expected_centroids, actual_centroids)
    self.assertTrue(centroids_match)
def test_kmeans_iris(self):
    """Check discomll k-means centroids against scikit-learn on the iris data.

    Three clusters are fitted by both libraries with identical iteration
    cap and seed; the centroid sets must agree within ``np.allclose``
    tolerance. discomll prediction is executed but its output is not
    asserted.
    """
    # python -m unittest tests_clustering.Tests_Clustering.test_kmeans_iris
    from discomll.clustering import kmeans
    from sklearn.cluster import KMeans

    n_clusters, iterations, seed = 3, 10, 0

    # Only the training features are needed from the scikit-learn fixture.
    sk_train, _, _, _ = datasets.iris()
    disco_train, disco_test = datasets.iris_discomll()

    reference = KMeans(
        n_clusters=n_clusters,
        max_iter=iterations,
        n_init=1,
        random_state=seed,
    ).fit(sk_train)
    expected_centroids = reference.cluster_centers_

    fitmodel = kmeans.fit(
        disco_train,
        n_clusters=n_clusters,
        max_iterations=iterations,
        random_state=seed,
    )
    # Run prediction too, so the predict job is exercised.
    kmeans.predict(disco_test, fitmodel)

    actual_centroids = []
    for _, value in result_iterator(fitmodel["kmeans_fitmodel"]):
        actual_centroids.append(value["x"])

    # Exchange the first and third discomll centroids before comparing;
    # presumably the cluster order differs from scikit-learn's for this seed.
    first, third = actual_centroids[2], actual_centroids[0]
    actual_centroids[0], actual_centroids[2] = first, third

    centroids_match = np.allclose(expected_centroids, actual_centroids)
    self.assertTrue(centroids_match)
def test_kmeans_breastcancer(self):
    """Compare scikit-learn and discomll k-means centroids (breast cancer set).

    Fits two clusters with each library using the same iteration limit and
    seed, then asserts the centroids are close according to
    ``np.allclose``. Test-set predictions are produced by both libraries
    but are not part of the assertion.
    """
    # python -m unittest tests_clustering.Tests_Clustering.test_kmeans_breastcancer
    from discomll.clustering import kmeans
    from sklearn.cluster import KMeans

    settings = {"clusters": 2, "max_iter": 10, "seed": 2}

    features_train, _, features_test, _ = datasets.breastcancer_disc()
    discomll_train, discomll_test = datasets.breastcancer_disc_discomll()

    # scikit-learn side.
    sk_model = KMeans(
        n_clusters=settings["clusters"],
        max_iter=settings["max_iter"],
        n_init=1,
        random_state=settings["seed"],
    ).fit(features_train)
    sk_centroids = sk_model.cluster_centers_
    sk_predictions = sk_model.predict(features_test)

    # discomll side.
    model_url = kmeans.fit(
        discomll_train,
        n_clusters=settings["clusters"],
        max_iterations=settings["max_iter"],
        random_state=settings["seed"],
    )
    predictions_url = kmeans.predict(discomll_test, model_url)
    discomll_predictions = [
        entry[0] for _, entry in result_iterator(predictions_url)
    ]
    discomll_centroids = [
        entry["x"] for _, entry in result_iterator(model_url["kmeans_fitmodel"])
    ]

    # The two discomll centroids are exchanged before the comparison;
    # presumably their order is reversed relative to scikit-learn here.
    swapped_pair = (discomll_centroids[1], discomll_centroids[0])
    discomll_centroids[0], discomll_centroids[1] = swapped_pair

    self.assertTrue(np.allclose(sk_centroids, discomll_centroids))
def test_kmeans_iris(self):
    """Compare scikit-learn and discomll k-means centroids (iris set).

    Fits three clusters with each library using the same iteration limit
    and seed, then asserts the centroids are close according to
    ``np.allclose``. The discomll predict job is run, but its results are
    not inspected.
    """
    # python -m unittest tests_clustering.Tests_Clustering.test_kmeans_iris
    from discomll.clustering import kmeans
    from sklearn.cluster import KMeans

    settings = {"clusters": 3, "max_iter": 10, "seed": 0}

    features_train, _, _, _ = datasets.iris()
    discomll_train, discomll_test = datasets.iris_discomll()

    sk_model = KMeans(
        n_clusters=settings["clusters"],
        max_iter=settings["max_iter"],
        n_init=1,
        random_state=settings["seed"],
    )
    sk_model = sk_model.fit(features_train)
    sk_centroids = sk_model.cluster_centers_

    model_url = kmeans.fit(
        discomll_train,
        n_clusters=settings["clusters"],
        max_iterations=settings["max_iter"],
        random_state=settings["seed"],
    )
    # Prediction output is unused; the call is kept to exercise predict.
    kmeans.predict(discomll_test, model_url)

    discomll_centroids = [
        entry["x"] for _, entry in result_iterator(model_url["kmeans_fitmodel"])
    ]

    # First and third discomll centroids are exchanged before comparing;
    # presumably their order differs from scikit-learn's for this seed.
    swapped_pair = (discomll_centroids[2], discomll_centroids[0])
    discomll_centroids[0], discomll_centroids[2] = swapped_pair

    self.assertTrue(np.allclose(sk_centroids, discomll_centroids))
def kmeans_fit(input_dict):
    """Fit a discomll k-means model and return the fit-model reference.

    Args:
        input_dict: mapping that provides
            ``"dataset"``  - the data object passed to ``kmeans.fit``,
            ``"clusters"`` - forwarded as ``n_clusters``,
            ``"itr"``      - forwarded as ``max_iterations``.

    Returns:
        dict with the single key ``"fitmodel_url"`` holding whatever
        ``kmeans.fit`` returned.

    Raises:
        KeyError: if one of the required keys is missing from ``input_dict``.
    """
    from discomll.clustering import kmeans

    data = input_dict["dataset"]
    cluster_count = input_dict["clusters"]
    iteration_limit = input_dict["itr"]

    model_reference = kmeans.fit(
        data,
        n_clusters=cluster_count,
        max_iterations=iteration_limit,
        save_results=True,
    )

    output = {}
    output["fitmodel_url"] = model_reference
    return output
def kmeans_fit(input_dict):
    """Run ``discomll.clustering.kmeans.fit`` with settings from ``input_dict``.

    Args:
        input_dict: mapping with the keys ``"dataset"`` (data handed to the
            fit call), ``"clusters"`` (used as ``n_clusters``) and ``"itr"``
            (used as ``max_iterations``).

    Returns:
        ``{"fitmodel_url": <result of kmeans.fit>}``.

    Raises:
        KeyError: if a required key is absent.
    """
    from discomll.clustering import kmeans

    # Read the inputs up front, in the same order the fit call consumes them.
    training_data = input_dict["dataset"]
    fit_options = dict(
        n_clusters=input_dict["clusters"],
        max_iterations=input_dict["itr"],
        save_results=True,
    )

    return {"fitmodel_url": kmeans.fit(training_data, **fit_options)}
def kmeans_fit(input_dict):
    """Fit a discomll k-means model, optionally seeded, and return its reference.

    Args:
        input_dict: mapping that provides
            ``"dataset"``  - the data object passed to ``kmeans.fit``,
            ``"clusters"`` - forwarded as ``n_clusters``,
            ``"itr"``      - forwarded as ``max_iterations``,
            ``"seed"``     - ``None`` or the string ``"None"`` for an
                             unseeded run, otherwise any value ``int()``
                             accepts (forwarded as ``random_state``).

    Returns:
        dict with the single key ``"fitmodel_url"`` holding whatever
        ``kmeans.fit`` returned.

    Raises:
        KeyError: if a required key is missing from ``input_dict``.
        ValueError: if the seed is a string that is not an integer literal.
    """
    from discomll.clustering import kmeans

    seed = input_dict["seed"]
    # Previously only the string "None" meant "no seed"; a real None fell
    # through to int() and raised TypeError. Both now mean unseeded.
    if seed is None or seed == "None":
        random_state = None
    else:
        random_state = int(seed)

    fitmodel_url = kmeans.fit(
        input_dict["dataset"],
        n_clusters=input_dict["clusters"],
        max_iterations=input_dict["itr"],
        random_state=random_state,
        save_results=True,
    )
    return {"fitmodel_url": fitmodel_url}
def kmeans_fit(input_dict):
    """Run discomll k-means fitting with an optional random seed.

    Args:
        input_dict: mapping with the keys ``"dataset"`` (data handed to the
            fit call), ``"clusters"`` (used as ``n_clusters``), ``"itr"``
            (used as ``max_iterations``) and ``"seed"``. A seed of ``None``
            or the string ``"None"`` requests an unseeded run; any other
            value is converted with ``int()`` and used as ``random_state``.

    Returns:
        ``{"fitmodel_url": <result of kmeans.fit>}``.

    Raises:
        KeyError: if a required key is absent.
        ValueError: if the seed is a string that is not an integer literal.
    """
    from discomll.clustering import kmeans

    raw_seed = input_dict["seed"]
    # Accept a real None as well as the "None" string; the original passed
    # a real None to int(), which raises TypeError.
    unseeded = raw_seed is None or raw_seed == "None"
    random_state = None if unseeded else int(raw_seed)

    fitmodel_url = kmeans.fit(
        input_dict["dataset"],
        n_clusters=input_dict["clusters"],
        max_iterations=input_dict["itr"],
        random_state=random_state,
        save_results=True,
    )
    return {"fitmodel_url": fitmodel_url}
from discomll.utils import model_view

# Example: fit discomll k-means on the breast-cancer training chunks, print
# the fitted model, then print a cluster prediction for every test sample.
# NOTE(review): `dataset`, `kmeans` and `result_iterator` are not imported in
# this span - presumably they are imported earlier in the file; confirm.

# Training dataset.
train = dataset.Data(
    data_tag=["test:breast_cancer_cont"],
    data_type="chunk",  # data source - chunk data on ddfs
    X_indices=range(0, 9),  # attribute indices (range runs on Python 2 and 3)
    y_index=9,  # class index
    delimiter=",",
)

# Test dataset.
test = dataset.Data(
    data_tag=["test:breast_cancer_cont_test"],
    data_type="chunk",  # data source - chunk data on ddfs
    X_indices=range(0, 9),  # attribute indices
    y_index=9,  # class index
    delimiter=",",
)

# Fit the model on the training dataset.
fit_model = kmeans.fit(train, n_clusters=2, max_iterations=5, random_state=0)

# Output the model.
model = model_view.output_model(fit_model)
print(model)

# Predict the test dataset.
predictions = kmeans.predict(test, fit_model)

# Output the results as "<key> <value>", one line per entry. Pre-formatting
# the line gives the same output as the old `print k, v` statement while
# remaining valid syntax on Python 3.
for k, v in result_iterator(predictions):
    print("%s %s" % (k, v))
from discomll import dataset
from discomll.clustering import kmeans

# Example: fit discomll k-means on gzip-compressed data addressed by URL and
# print the value returned by the prediction step.

# Training data: column 0 is the sample identifier, columns 1..21 are the
# features used for clustering.
train = dataset.Data(
    data_tag=[
        "http://ropot.ijs.si/data/linear/train/xaaaaa.gz",
        "http://ropot.ijs.si/data/linear/train/xaaabj.gz",
    ],
    data_type="gzip",
    generate_urls=True,
    X_indices=range(1, 22),
    id_index=0,
    delimiter=",",
)

# Test data, laid out the same way as the training data.
test = dataset.Data(
    data_tag=[
        "http://ropot.ijs.si/data/linear/test/xaaaaa.gz",
        "http://ropot.ijs.si/data/linear/test/xaaabj.gz",
    ],
    data_type="gzip",
    generate_urls=True,
    X_indices=range(1, 22),
    id_index=0,
    delimiter=",",
)

fit_model = kmeans.fit(train, n_clusters=5, max_iterations=10, random_state=0)
predictions = kmeans.predict(test, fit_model)

# Single-argument print call: same output as the Python 2 print statement,
# and valid syntax on Python 3 as well.
print(predictions)