def cluster_cobweb3(data):
    """Cluster the rows of ``data`` with COBWEB/3 and save the labelled data.

    Every row is converted into a dict keyed by the module-level ``dim_list``
    and clustered with a fresh ``Cobweb3Tree``. The resulting integer labels
    are appended to ``data`` as an extra column, and the combined array is
    written as comma-separated text to the module-level
    ``file_location_out_data``.

    Args:
        data: 2-D numpy array indexed as ``data[point, channel]``.

    Returns:
        A ``(clusters, lbs)`` tuple. ``clusters`` is the first clustering
        produced by ``cluster`` (one cluster name per point) and ``lbs`` is
        an ``(npts, 1)`` numpy array holding the matching integer labels.
    """
    n_channels = data.shape[1]

    # convert data from np array to list of dictionaries
    data_new = [
        {dim_list[j]: row[j] for j in range(n_channels)}
        for row in data
    ]

    # perform cobweb3 clustering and get labels
    print('starting cobweb3')
    print('note, this can take some time ...')
    start_time = time.time()

    tree = Cobweb3Tree()
    # next(iter(...)) takes the first clustering whether ``cluster`` hands
    # back a list or a lazily evaluated iterator.
    clusters = next(iter(cluster(tree, data_new)))
    print('# points:', len(clusters))

    # Number the clusters in order of first appearance so that the integer
    # labels are reproducible (iterating a set of names gives arbitrary order).
    clust_names = list(dict.fromkeys(clusters))
    print(' cluster names:', clust_names)
    clust_dict = {name: idx for idx, name in enumerate(clust_names)}
    print(clust_dict)
    lbs = [clust_dict[c] for c in clusters]
    print('length of lbs:', len(lbs))

    elapsed_time = time.time() - start_time
    print('done, elapsed mins:', np.round(elapsed_time / 60, 2))

    # append labels to csv file of data
    lbs = np.asarray(lbs).reshape(len(lbs), 1)
    print(lbs.shape)
    new = np.concatenate((data, lbs), axis=1)
    print(new.shape)
    np.savetxt(file_location_out_data, new, delimiter=',')
    print('done with cluster_cobweb3')

    # main use of this function is to return the clusters and the labels
    return clusters, lbs
from random import shuffle

from sklearn.metrics import adjusted_rand_score
import matplotlib.pyplot as plt

from concept_formation.trestle import TrestleTree
from concept_formation.cluster import cluster
from concept_formation.datasets import load_rb_wb_03
from concept_formation.structure_mapper import ObjectVariablizer

# Work on a random sample of 30 towers.
towers = load_rb_wb_03()
shuffle(towers)
towers = towers[:30]

variablizer = ObjectVariablizer()
towers = [variablizer.transform(t) for t in towers]

tree = TrestleTree()
# Materialise the clusterings: they are counted with len() and then iterated
# below, which a lazily evaluated result would not support.
clusters = list(cluster(tree, towers, maxsplit=10))
human_labels = [tower['_human_cluster_label'] for tower in towers]

# One point per clustering returned: x is its 1-based position, y its
# agreement with the human labels.
x = list(range(1, len(clusters) + 1))
y = [adjusted_rand_score(human_labels, split) for split in clusters]

plt.plot(x, y, label="TRESTLE")
plt.title("TRESTLE Clustering Accuracy (Given Human Ground Truth)")
plt.ylabel("Adjusted Rand Index (Agreement Correcting for Chance)")
plt.xlabel("# of Splits of Trestle Tree")
plt.legend(loc=4)
plt.show()
from concept_formation.cobweb import CobwebTree
from concept_formation.cluster import cluster
from concept_formation.datasets import load_mushroom

# Fixed seed so the shuffled 150-instance sample is the same on every run.
seed(0)
mushrooms = load_mushroom()
shuffle(mushrooms)
mushrooms = mushrooms[:150]

tree = CobwebTree()
# Hide the 'classification' attribute from the tree; it is only used
# afterwards to score the clustering.
mushrooms_no_class = [
    {a: mushroom[a] for a in mushroom if a != 'classification'}
    for mushroom in mushrooms
]
# Take the first clustering. next(iter(...)) works whether ``cluster``
# returns a list or a lazily evaluated iterator.
clusters = next(iter(cluster(tree, mushrooms_no_class)))

mushroom_class = [
    mushroom['classification']
    for mushroom in mushrooms
    if 'classification' in mushroom
]
ari = adjusted_rand_score(clusters, mushroom_class)

# One-hot encode the attribute dicts, then project onto two principal
# components.
dv = DictVectorizer(sparse=False)
mushroom_X = dv.fit_transform(mushrooms_no_class)
pca = PCA(n_components=2)
mushroom_2d_x = pca.fit_transform(mushroom_X)

colors = ['b', 'g', 'r', 'y', 'k', 'c', 'm']
# Map every distinct cluster / class label to a small integer index.
clust_set = {v: i for i, v in enumerate(set(clusters))}
class_set = {v: i for i, v in enumerate(set(mushroom_class))}
def run_clust_exp(nominal_noise=0, numeric_noise=0, scaling=False):
    """Generate a noisy three-group dataset and cluster it with COBWEB/3.

    Sixty instances are generated for each of the groups G1, G2 and G3.
    Every instance carries a ``_label`` naming its group, two nominal
    features (``f1``, ``f2``) and two numeric features (``f3``, ``f4``).
    Each feature is drawn independently: with the relevant noise probability
    it is taken from one of the two *other* groups instead of its own.

    Args:
        nominal_noise: Probability that ``f1`` / ``f2`` come from another
            group.
        numeric_noise: Probability that ``f3`` / ``f4`` come from another
            group.
        scaling: Forwarded unchanged to ``Cobweb3Tree(scaling=...)``.

    Returns:
        ``(data, clustering)``: the shuffled list of instance dicts and the
        first clustering produced by ``cluster`` for them.
    """
    # (label, f1 value, f2 values, f3 mean, f4 mean).
    # f3 is sampled with standard deviation 1, f4 with standard deviation 2.
    groups = [
        ("G1", "G1f1", ["G1f2a", "G1f2b"], 4, 20),
        ("G2", "G2f1", ["G2f2a", "G2f2b"], 10, 32),
        ("G3", "G3f1", ["G3f2a", "G3f2b"], 16, 44),
    ]

    data = []
    for group in groups:
        label, f1_value, f2_values, f3_mean, f4_mean = group
        # Noise always comes from the two other groups. The original G2 loop
        # listed its own "G2f1" among the noisy f1 choices, so half of its
        # "noisy" f1 values were in fact clean.
        others = [g for g in groups if g is not group]

        for _ in range(60):
            x = {'_label': label}

            if random() >= nominal_noise:
                x['f1'] = f1_value
            else:
                x['f1'] = choice([g[1] for g in others])

            if random() >= nominal_noise:
                x['f2'] = choice(f2_values)
            else:
                x['f2'] = choice([v for g in others for v in g[2]])

            if random() >= numeric_noise:
                x['f3'] = np.random.normal(f3_mean, 1, 1)[0]
            else:
                # Both candidates are sampled before one is picked, matching
                # the order of random draws of the original code.
                x['f3'] = choice(
                    [np.random.normal(g[3], 1, 1)[0] for g in others])

            if random() >= numeric_noise:
                x['f4'] = np.random.normal(f4_mean, 2, 1)[0]
            else:
                x['f4'] = choice(
                    [np.random.normal(g[4], 2, 1)[0] for g in others])

            data.append(x)

    shuffle(data)
    t = Cobweb3Tree(scaling=scaling)
    clustering = cluster(t, data)
    # next(iter(...)) takes the first clustering whether ``cluster`` returns
    # a list or a lazily evaluated iterator.
    return data, next(iter(clustering))
from sklearn.metrics import adjusted_rand_score
import matplotlib.pyplot as plt

from concept_formation.trestle import TrestleTree
from concept_formation.cluster import cluster
from concept_formation.datasets import load_rb_wb_03
from concept_formation.preprocessor import ObjectVariablizer

# Seed the RNG so the 60-tower sample is reproducible.
seed(0)

towers = load_rb_wb_03()
shuffle(towers)
towers = towers[:60]

# Run every sampled tower through the variablizer before clustering.
variablizer = ObjectVariablizer()
towers = list(map(variablizer.transform, towers))

tree = TrestleTree()
clusters = list(cluster(tree, towers, maxsplit=10))
human_labels = [tower['_human_cluster_label'] for tower in towers]

# x is the 1-based position of each clustering, y its adjusted Rand index
# against the human labels.
x = list(range(1, len(clusters) + 1))
y = []
for split in clusters:
    y.append(adjusted_rand_score(human_labels, split))

plt.plot(x, y, label="TRESTLE")
plt.title("TRESTLE Clustering Accuracy (Given Human Ground Truth)")
plt.ylabel("Adjusted Rand Index (Agreement Correcting for Chance)")
plt.xlabel("# of Splits of Trestle Tree")
plt.legend(loc=4)
plt.show()
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import adjusted_rand_score
from concept_formation.cobweb3 import Cobweb3Tree
from concept_formation.cluster import cluster
from concept_formation.datasets import load_iris

# NOTE(review): `shuffle` is not imported in this excerpt - presumably
# `from random import shuffle` sits above it; confirm.
irises = load_iris()
shuffle(irises)

tree = Cobweb3Tree()
# Drop the 'class' attribute so the tree never sees it; it is used below
# only to score the clustering.
irises_no_class = [{a: iris[a] for a in iris if a != 'class'} for iris in irises]
# Keep only the first clustering returned by cluster().
clusters = cluster(tree, irises_no_class)[0]
# 'class' value of every iris that has one, in the same (shuffled) order.
iris_class = [iris[a] for iris in irises for a in iris if a == 'class']
# Adjusted Rand index between the cluster labels and the 'class' values.
ari = adjusted_rand_score(clusters, iris_class)

# Vectorize the class-free attribute dicts and reduce them to two PCA
# components.
dv = DictVectorizer(sparse=False)
iris_X = dv.fit_transform([{a:iris[a] for a in iris if a != 'class'} for iris in irises])
pca = PCA(n_components=2)
iris_2d_x = pca.fit_transform(iris_X)

colors = ['b', 'g', 'r', 'y', 'k', 'c', 'm']
shapes = ['o', '^', '+']
# Map every distinct cluster / class label to an integer index. The order
# comes from set iteration, so it is arbitrary.
clust_set = {v:i for i,v in enumerate(list(set(clusters)))}
class_set = {v:i for i,v in enumerate(list(set(iris_class)))}

# Iterates over the keys of class_set, i.e. the distinct class labels.
# NOTE(review): the loop body appears to continue beyond this excerpt.
for class_idx, class_label in enumerate(class_set):
    # First PCA component of every point whose 'class' equals class_label.
    x = [v[0] for i,v in enumerate(iris_2d_x) if iris_class[i] == class_label]
print("Continue clustering now...")
seed(0)
instances = _load_json(i)
shuffle(instances)

# Build a fresh CobwebTree for this run.
tree = CobwebTree()

# The _features.json file contains one attribute called "classification".
# This is the ground truth: we choose each CHAPTER name as the
# classification value, so similar instances (from the same chapter) should
# be grouped together. The tree is therefore fitted without that attribute.
instances_no_class = []
for instance in instances:
    visible = {
        name: instance[name]
        for name in instance
        if name != 'classification'
    }
    instances_no_class.append(visible)

clusters = cluster(tree, instances_no_class)[0]
instance_class = [
    instance['classification']
    for instance in instances
    if 'classification' in instance
]
ari = adjusted_rand_score(clusters, instance_class)

dv = DictVectorizer(sparse=False)
instance_X = dv.fit_transform(instances_no_class)

# Principal Component Analysis to reduce the data to two dimensions.
pca = PCA(n_components=2)
instance_2d_x = pca.fit_transform(instance_X)

# Colours and label-to-index lookups used for plotting the result.
colors = ['dodgerblue', 'darkred', 'teal', "silver", "pink", "y"]
clust_set = {label: idx for idx, label in enumerate(set(clusters))}
class_set = {label: idx for idx, label in enumerate(set(instance_class))}
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import adjusted_rand_score

from concept_formation.cobweb3 import Cobweb3Tree
from concept_formation.cluster import cluster
from concept_formation.datasets import load_iris

# Seeded shuffle so the presentation order is reproducible.
seed(0)
irises = load_iris()
shuffle(irises)

tree = Cobweb3Tree()

# The tree is fitted without the 'class' attribute, which is kept aside as
# the reference labelling.
irises_no_class = []
for iris in irises:
    irises_no_class.append({a: iris[a] for a in iris if a != 'class'})

clusters = next(cluster(tree, irises_no_class))
iris_class = [iris['class'] for iris in irises if 'class' in iris]
ari = adjusted_rand_score(clusters, iris_class)

# Vectorize the class-free dicts and keep two principal components.
dv = DictVectorizer(sparse=False)
iris_X = dv.fit_transform(irises_no_class)
pca = PCA(n_components=2)
iris_2d_x = pca.fit_transform(iris_X)

colors = ['b', 'g', 'r', 'y', 'k', 'c', 'm']
shapes = ['o', '^', '+']

# Distinct cluster / class labels mapped to integer indices.
clust_set = {label: idx for idx, label in enumerate(set(clusters))}
class_set = {label: idx for idx, label in enumerate(set(iris_class))}
from sklearn.metrics import adjusted_rand_score
import matplotlib.pyplot as plt

from concept_formation.trestle import TrestleTree
from concept_formation.cluster import cluster
from concept_formation.datasets import load_rb_wb_03
from concept_formation.preprocessor import ObjectVariablizer

seed(0)

# Reproducible random sample of 60 towers.
towers = load_rb_wb_03()
shuffle(towers)
sample = towers[:60]

variablizer = ObjectVariablizer()
towers = [variablizer.transform(tower) for tower in sample]

tree = TrestleTree()
clusters = list(cluster(tree, towers, maxsplit=10))

human_labels = []
for tower in towers:
    human_labels.append(tower['_human_cluster_label'])

# One score per clustering, plotted against its 1-based position.
y = [adjusted_rand_score(human_labels, split) for split in clusters]
x = list(range(1, len(clusters) + 1))

plt.plot(x, y, label="TRESTLE")
plt.title("TRESTLE Clustering Accuracy (Given Human Ground Truth)")
plt.ylabel("Adjusted Rand Index (Agreement Correcting for Chance)")
plt.xlabel("# of Splits of Trestle Tree")
plt.legend(loc=4)
plt.show()
from sklearn.metrics import adjusted_rand_score

from concept_formation.cobweb import CobwebTree
from concept_formation.cluster import cluster
from concept_formation.datasets import load_mushroom

# Reproducible 150-instance sample of the mushroom data.
seed(0)
mushrooms = load_mushroom()
shuffle(mushrooms)
mushrooms = mushrooms[:150]

tree = CobwebTree()

# Strip the 'classification' attribute before fitting; it serves as the
# reference labelling afterwards.
mushrooms_no_class = []
for mushroom in mushrooms:
    features = {a: mushroom[a] for a in mushroom if a != 'classification'}
    mushrooms_no_class.append(features)

clusters = next(cluster(tree, mushrooms_no_class))
mushroom_class = [
    mushroom['classification']
    for mushroom in mushrooms
    if 'classification' in mushroom
]
ari = adjusted_rand_score(clusters, mushroom_class)

# Vectorize the attribute dicts and project them onto two principal
# components.
dv = DictVectorizer(sparse=False)
mushroom_X = dv.fit_transform(mushrooms_no_class)
pca = PCA(n_components=2)
mushroom_2d_x = pca.fit_transform(mushroom_X)

colors = ['b', 'g', 'r', 'y', 'k', 'c', 'm']
# Distinct cluster / class labels mapped to integer indices.
clust_set = {label: idx for idx, label in enumerate(set(clusters))}
class_set = {label: idx for idx, label in enumerate(set(mushroom_class))}
from sklearn.metrics import adjusted_rand_score
import matplotlib.pyplot as plt

from concept_formation.trestle import TrestleTree
from concept_formation.cluster import cluster
from concept_formation.datasets import load_rb_wb_03
from concept_formation.preprocessor import ObjectVariablizer

# Fixed seed so the 60-tower sample is the same on every run.
seed(0)
towers = load_rb_wb_03()
shuffle(towers)
towers = towers[:60]

variablizer = ObjectVariablizer()
towers = [variablizer.transform(t) for t in towers]

tree = TrestleTree()
# Materialise the clusterings: they are counted with len() and then iterated
# below, which a lazily evaluated result would not support.
clusters = list(cluster(tree, towers, maxsplit=10))
human_labels = [tower['_human_cluster_label'] for tower in towers]

# One point per clustering returned: x is its 1-based position, y its
# agreement with the human labels.
x = list(range(1, len(clusters) + 1))
y = [adjusted_rand_score(human_labels, split) for split in clusters]

plt.plot(x, y, label="TRESTLE")
plt.title("TRESTLE Clustering Accuracy (Given Human Ground Truth)")
plt.ylabel("Adjusted Rand Index (Agreement Correcting for Chance)")
plt.xlabel("# of Splits of Trestle Tree")
plt.legend(loc=4)
plt.show()
from sklearn.decomposition import PCA
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import adjusted_rand_score
from concept_formation.cobweb import CobwebTree
from concept_formation.cluster import cluster
from concept_formation.datasets import load_mushroom

# NOTE(review): `shuffle` is not imported in this excerpt - presumably
# `from random import shuffle` sits above it; confirm.
# Keep the first 150 mushrooms after shuffling.
mushrooms = load_mushroom()
shuffle(mushrooms)
mushrooms = mushrooms[:150]

tree = CobwebTree()
# Drop the 'classification' attribute so the tree never sees it; it is used
# below only to score the clustering.
mushrooms_no_class = [{a: mushroom[a] for a in mushroom if a != 'classification'} for mushroom in mushrooms]
# Keep only the first clustering returned by cluster().
clusters = cluster(tree, mushrooms_no_class)[0]
# 'classification' value of every mushroom that has one, in sample order.
mushroom_class = [mushroom[a] for mushroom in mushrooms for a in mushroom if a == 'classification']
# Adjusted Rand index between the cluster labels and the classifications.
ari = adjusted_rand_score(clusters, mushroom_class)

# Vectorize the attribute dicts and reduce them to two PCA components.
dv = DictVectorizer(sparse=False)
mushroom_X = dv.fit_transform(mushrooms_no_class)
pca = PCA(n_components=2)
mushroom_2d_x = pca.fit_transform(mushroom_X)

colors = ['b', 'g', 'r', 'y', 'k', 'c', 'm']
# Map every distinct cluster / class label to an integer index. The order
# comes from set iteration, so it is arbitrary.
clust_set = {v:i for i,v in enumerate(list(set(clusters)))}
class_set = {v:i for i,v in enumerate(list(set(mushroom_class)))}

# Iterates over the keys of class_set, i.e. the distinct class labels.
# NOTE(review): the loop body lies beyond this excerpt.
for class_idx, class_label in enumerate(class_set):
def run_clust_exp(nominal_noise=0, numeric_noise=0, scaling=False):
    """Build a noisy synthetic dataset of three groups and cluster it.

    For each of the groups G1, G2 and G3, sixty instances are generated.
    An instance is a dict with a ``_label`` entry naming its group, nominal
    features ``f1`` and ``f2``, and numeric features ``f3`` and ``f4``. Each
    feature is drawn independently: with the relevant noise probability its
    value is taken from one of the two *other* groups instead of its own.

    Args:
        nominal_noise: Probability of corrupting ``f1`` / ``f2``.
        numeric_noise: Probability of corrupting ``f3`` / ``f4``.
        scaling: Passed straight through to ``Cobweb3Tree(scaling=...)``.

    Returns:
        ``(data, clustering)``: the shuffled list of instance dicts and the
        first clustering produced by ``cluster`` for them.
    """
    # Per-group specification: label, clean f1 value, clean f2 values, and
    # the means of f3 (std 1) and f4 (std 2).
    specs = [
        {"label": "G1", "f1": "G1f1", "f2": ["G1f2a", "G1f2b"],
         "f3": 4, "f4": 20},
        {"label": "G2", "f1": "G2f1", "f2": ["G2f2a", "G2f2b"],
         "f3": 10, "f4": 32},
        {"label": "G3", "f1": "G3f1", "f2": ["G3f2a", "G3f2b"],
         "f3": 16, "f4": 44},
    ]

    data = []
    for spec in specs:
        # Noisy values come only from the two other groups. The original G2
        # loop offered its own "G2f1" as a noisy f1 value, so half of its
        # corrupted f1 values were not corrupted at all.
        rivals = [other for other in specs if other is not spec]
        noisy_f1 = [other["f1"] for other in rivals]
        noisy_f2 = [value for other in rivals for value in other["f2"]]

        for _ in range(60):
            x = {'_label': spec["label"]}

            if random() >= nominal_noise:
                x['f1'] = spec["f1"]
            else:
                x['f1'] = choice(noisy_f1)

            if random() >= nominal_noise:
                x['f2'] = choice(spec["f2"])
            else:
                x['f2'] = choice(noisy_f2)

            if random() >= numeric_noise:
                x['f3'] = np.random.normal(spec["f3"], 1, 1)[0]
            else:
                # Sample one candidate per rival group, then pick one - the
                # same order of random draws as the original code.
                x['f3'] = choice(
                    [np.random.normal(other["f3"], 1, 1)[0]
                     for other in rivals])

            if random() >= numeric_noise:
                x['f4'] = np.random.normal(spec["f4"], 2, 1)[0]
            else:
                x['f4'] = choice(
                    [np.random.normal(other["f4"], 2, 1)[0]
                     for other in rivals])

            data.append(x)

    shuffle(data)
    t = Cobweb3Tree(scaling=scaling)
    clustering = cluster(t, data)
    # next(iter(...)) takes the first clustering whether ``cluster`` returns
    # a list or a lazily evaluated iterator.
    return data, next(iter(clustering))
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import adjusted_rand_score

from concept_formation.cobweb3 import Cobweb3Tree
from concept_formation.cluster import cluster
from concept_formation.datasets import load_iris

seed(0)

# Shuffle the irises with a fixed seed so runs are repeatable.
irises = load_iris()
shuffle(irises)

# Copies of the irises without their 'class' attribute; these are what the
# tree is fitted on.
irises_no_class = [
    {attr: iris[attr] for attr in iris if attr != 'class'}
    for iris in irises
]

tree = Cobweb3Tree()
clusters = next(cluster(tree, irises_no_class))

# Reference labels, scored against the clustering.
iris_class = [iris['class'] for iris in irises if 'class' in iris]
ari = adjusted_rand_score(clusters, iris_class)

# Two-component PCA projection of the vectorized, class-free irises.
dv = DictVectorizer(sparse=False)
iris_X = dv.fit_transform(irises_no_class)
pca = PCA(n_components=2)
iris_2d_x = pca.fit_transform(iris_X)

colors = ['b', 'g', 'r', 'y', 'k', 'c', 'm']
shapes = ['o', '^', '+']

# Integer index for every distinct cluster / class label.
clust_set = {name: pos for pos, name in enumerate(set(clusters))}
class_set = {name: pos for pos, name in enumerate(set(iris_class))}