Exemplo n.º 1
0
def cluster_cobweb3(data):
    """Cluster the rows of ``data`` with a Cobweb3Tree and save the result.

    Every row is turned into a dict keyed by the module-level ``dim_list``
    and handed to ``cluster`` together with a fresh ``Cobweb3Tree``.  The
    cluster name assigned to each row is mapped to an integer label, the
    labels are appended to ``data`` as one extra column, and the combined
    array is written as comma-separated text to the module-level
    ``file_location_out_data``.

    Args:
        data: 2-D numpy array with one point per row and one channel per
            column.  ``dim_list`` must provide a name for every column.

    Returns:
        A tuple ``(clusters, lbs)``.  ``clusters`` is the first element of
        what ``cluster`` returns (indexed here as one cluster name per
        row); ``lbs`` is an ``(n_points, 1)`` numpy array with the integer
        label of each row.
    """
    # Convert the numpy array into the list-of-dicts form passed to cluster().
    data_new = [
        {dim_list[j]: value for j, value in enumerate(row)}
        for row in data
    ]

    # Perform cobweb3 clustering and get labels.
    print('starting cobweb3')
    print('note, this can take some time ...')
    start_time = time.time()

    tree = Cobweb3Tree()
    clusters = cluster(tree, data_new)[0]
    print('# points:', len(clusters))

    # Number the clusters in order of first appearance.  Enumerating a set
    # of names instead would make the integer labels depend on set iteration
    # order, which is not stable between interpreter runs for strings.
    clust_dict = {}
    for name in clusters:
        if name not in clust_dict:
            clust_dict[name] = len(clust_dict)
    clust_names = list(clust_dict)
    print('  cluster names:', clust_names)
    print(clust_dict)

    lbs = [clust_dict[name] for name in clusters]
    print('length of lbs:', len(lbs))

    elapsed_time = time.time() - start_time
    print('done, elapsed mins:', np.round(elapsed_time / 60, 2))

    # Append the labels as a new last column and write everything to disk.
    lbs = np.asarray(lbs).reshape(-1, 1)
    print(lbs.shape)
    new = np.concatenate((data, lbs), axis=1)
    print(new.shape)

    np.savetxt(file_location_out_data, new, delimiter=',')

    print('done with cluster_cobweb3')

    return clusters, lbs
from random import shuffle

from sklearn.metrics import adjusted_rand_score
import matplotlib.pyplot as plt

from concept_formation.trestle import TrestleTree
from concept_formation.cluster import cluster
from concept_formation.datasets import load_rb_wb_03
from concept_formation.structure_mapper import ObjectVariablizer

# Load the data set, shuffle it and keep a 30-instance sample.
towers = load_rb_wb_03()
shuffle(towers)
towers = towers[:30]

# Pass every instance through the ObjectVariablizer before clustering.
variablizer = ObjectVariablizer()
towers = [variablizer.transform(t) for t in towers]

tree = TrestleTree()
# Materialise the clusterings: len() and the iteration below must both work
# whether cluster() returns a list or a lazy iterator.
clusters = list(cluster(tree, towers, maxsplit=10))
human_labels = [tower['_human_cluster_label'] for tower in towers]

# One ARI score per clustering, plotted against its 1-based position.
x = list(range(1, len(clusters) + 1))
y = [adjusted_rand_score(human_labels, split) for split in clusters]
plt.plot(x, y, label="TRESTLE")

plt.title("TRESTLE Clustering Accuracy (Given Human Ground Truth)")
plt.ylabel("Adjusted Rand Index (Agreement Correcting for Chance)")
plt.xlabel("# of Splits of Trestle Tree")
plt.legend(loc=4)
plt.show()
Exemplo n.º 3
0
from concept_formation.cobweb import CobwebTree
from concept_formation.cluster import cluster
from concept_formation.datasets import load_mushroom

# Fixed seed, then shuffle and keep the first 150 instances.
seed(0)
mushrooms = load_mushroom()
shuffle(mushrooms)
mushrooms = mushrooms[:150]

tree = CobwebTree()

# Copy of every instance without its 'classification' attribute.
mushrooms_no_class = [
    {attr: value for attr, value in mushroom.items()
     if attr != 'classification'}
    for mushroom in mushrooms
]
clusters = cluster(tree, mushrooms_no_class)[0]

# The held-out 'classification' values, compared with the clusters via ARI.
mushroom_class = [
    mushroom['classification'] for mushroom in mushrooms
    if 'classification' in mushroom
]
ari = adjusted_rand_score(clusters, mushroom_class)

# Vectorise the class-free instances and project them to two dimensions.
dv = DictVectorizer(sparse=False)
mushroom_X = dv.fit_transform(mushrooms_no_class)

pca = PCA(n_components=2)
mushroom_2d_x = pca.fit_transform(mushroom_X)

colors = ['b', 'g', 'r', 'y', 'k', 'c', 'm']
# Index lookups for the distinct cluster names and class values.
clust_set = {label: idx for idx, label in enumerate(set(clusters))}
class_set = {label: idx for idx, label in enumerate(set(mushroom_class))}
Exemplo n.º 4
0
def run_clust_exp(nominal_noise=0, numeric_noise=0, scaling=False):
    """Generate a noisy three-group data set and cluster it with Cobweb3.

    60 instances are created for each of the groups "G1", "G2" and "G3".
    An instance stores its group under ``'_label'`` and has two nominal
    attributes (``'f1'``, ``'f2'``) and two numeric attributes (``'f3'``,
    ``'f4'``).  For every attribute a ``random()`` draw is compared with
    the matching noise level; when the draw is below it, the attribute
    gets a value typical of one of the *other* two groups.

    Args:
        nominal_noise: noise threshold for ``'f1'`` and ``'f2'``.
        numeric_noise: noise threshold for ``'f3'`` and ``'f4'``.
        scaling: passed unchanged to ``Cobweb3Tree(scaling=...)``.

    Returns:
        A tuple ``(data, labels)``: ``data`` is the shuffled list of
        instance dicts and ``labels`` is the first clustering returned by
        ``cluster`` for that list.
    """
    groups = ("G1", "G2", "G3")
    instances_per_group = 60
    # (mean, standard deviation) of each numeric attribute, per group.
    numeric_params = {
        'f3': {"G1": (4, 1), "G2": (10, 1), "G3": (16, 1)},
        'f4': {"G1": (20, 2), "G2": (32, 2), "G3": (44, 2)},
    }

    def sample_numeric(attr, group):
        mean, std = numeric_params[attr][group]
        return np.random.normal(mean, std, 1)[0]

    data = []
    for group in groups:
        others = [g for g in groups if g != group]
        for _ in range(instances_per_group):
            x = {'_label': group}

            if random() >= nominal_noise:
                x['f1'] = group + "f1"
            else:
                # Noise always comes from another group.  The G2 branch
                # used to offer "G2f1" itself and never "G1f1".
                x['f1'] = choice([g + "f1" for g in others])

            if random() >= nominal_noise:
                x['f2'] = choice([group + "f2a", group + "f2b"])
            else:
                x['f2'] = choice([g + suffix for g in others
                                  for suffix in ("f2a", "f2b")])

            for attr in ('f3', 'f4'):
                if random() >= numeric_noise:
                    x[attr] = sample_numeric(attr, group)
                else:
                    # Draw one candidate per other group, then pick one.
                    x[attr] = choice(
                        [sample_numeric(attr, g) for g in others])

            data.append(x)

    shuffle(data)
    t = Cobweb3Tree(scaling=scaling)
    clustering = cluster(t, data)
    return data, clustering[0]
from sklearn.metrics import adjusted_rand_score
import matplotlib.pyplot as plt

from concept_formation.trestle import TrestleTree
from concept_formation.cluster import cluster
from concept_formation.datasets import load_rb_wb_03
from concept_formation.preprocessor import ObjectVariablizer

# Fixed seed so the shuffle below is repeatable.
seed(0)

# Load, shuffle and keep a 60-instance sample.
towers = load_rb_wb_03()
shuffle(towers)
towers = towers[:60]

# Pass every instance through the ObjectVariablizer before clustering.
variablizer = ObjectVariablizer()
towers = [variablizer.transform(t) for t in towers]

tree = TrestleTree()
# Collect every clustering that cluster() yields into a list.
clusters = list(cluster(tree, towers, maxsplit=10))
human_labels = [tower['_human_cluster_label'] for tower in towers]

# One ARI score per clustering, plotted against its 1-based position.
x = list(range(1, len(clusters) + 1))
y = [adjusted_rand_score(human_labels, split) for split in clusters]
plt.plot(x, y, label="TRESTLE")

plt.title("TRESTLE Clustering Accuracy (Given Human Ground Truth)")
plt.ylabel("Adjusted Rand Index (Agreement Correcting for Chance)")
plt.xlabel("# of Splits of Trestle Tree")
plt.legend(loc=4)
plt.show()
Exemplo n.º 6
0
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import adjusted_rand_score

from concept_formation.cobweb3 import Cobweb3Tree
from concept_formation.cluster import cluster
from concept_formation.datasets import load_iris

irises = load_iris()
shuffle(irises)

tree = Cobweb3Tree()

# Copy of every instance without its 'class' attribute.
irises_no_class = [
    {attr: value for attr, value in iris.items() if attr != 'class'}
    for iris in irises
]
clusters = cluster(tree, irises_no_class)[0]

# The held-out 'class' values, compared with the clusters via ARI.
iris_class = [iris['class'] for iris in irises if 'class' in iris]
ari = adjusted_rand_score(clusters, iris_class)

# Vectorise the class-free instances built above (no need to rebuild the
# same list a second time) and project them to two dimensions.
dv = DictVectorizer(sparse=False)
iris_X = dv.fit_transform(irises_no_class)
pca = PCA(n_components=2)
iris_2d_x = pca.fit_transform(iris_X)

colors = ['b', 'g', 'r', 'y', 'k', 'c', 'm']
shapes = ['o', '^', '+']
# Index lookups for the distinct cluster names and class values.
clust_set = {label: idx for idx, label in enumerate(set(clusters))}
class_set = {label: idx for idx, label in enumerate(set(iris_class))}

for class_idx, class_label in enumerate(class_set):
    # First projected coordinate of every point whose class is class_label.
    x = [v[0] for i, v in enumerate(iris_2d_x) if iris_class[i] == class_label]
Exemplo n.º 7
0
        # print(i)
        print("Continue clustering now...")
        # Reseed before shuffling so the instance order is repeatable.
        seed(0)
        instances = _load_json(i)
        shuffle(instances)

        # Create the CobwebTree used for clustering
        tree = CobwebTree()

        # The _features.json file contains one attribute called "classification"
        # This is the ground truth
        # In our case, we choose each CHAPTER name as the classification value
        # Thereby, similar instances (from the same chapter) shall be grouped together
        # Strip the ground-truth attribute before the instances are clustered.
        instances_no_class = [{a: instance[a] for a in instance
                               if a != 'classification'} for instance in instances]
        clusters = cluster(tree, instances_no_class)[0]
        # Collect the held-out ground-truth values, in instance order.
        instance_class = [instance[a] for instance in instances for a in instance
                          if a == 'classification']
        # Agreement between the clustering and the ground truth.
        ari = adjusted_rand_score(clusters, instance_class)

        # Turn the class-free instance dicts into a dense feature matrix.
        dv = DictVectorizer(sparse=False)
        instance_X = dv.fit_transform(instances_no_class)

        # Principal Component Analysis to reduce dimensionality to 2 components
        pca = PCA(n_components=2)
        instance_2d_x = pca.fit_transform(instance_X)

        # Plotting the result
        colors=['dodgerblue','darkred','teal', "silver", "pink", "y"]
        # Index lookups for the distinct cluster names and class values.
        clust_set = {v: i for i, v in enumerate(list(set(clusters)))}
        class_set = {v: i for i, v in enumerate(list(set(instance_class)))}
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import adjusted_rand_score

from concept_formation.cobweb3 import Cobweb3Tree
from concept_formation.cluster import cluster
from concept_formation.datasets import load_iris

# Fixed seed so the shuffle below is repeatable.
seed(0)
irises = load_iris()
shuffle(irises)

tree = Cobweb3Tree()

# Copy of every instance without its 'class' attribute.
irises_no_class = [
    {attr: value for attr, value in iris.items() if attr != 'class'}
    for iris in irises
]
# Only the first clustering produced by cluster() is used.
clusters = next(cluster(tree, irises_no_class))

# The held-out 'class' values, compared with the clusters via ARI.
iris_class = [iris['class'] for iris in irises if 'class' in iris]
ari = adjusted_rand_score(clusters, iris_class)

# Vectorise the class-free instances built above (no need to rebuild the
# same list a second time) and project them to two dimensions.
dv = DictVectorizer(sparse=False)
iris_X = dv.fit_transform(irises_no_class)
pca = PCA(n_components=2)
iris_2d_x = pca.fit_transform(iris_X)

colors = ['b', 'g', 'r', 'y', 'k', 'c', 'm']
shapes = ['o', '^', '+']
# Index lookups for the distinct cluster names and class values.
clust_set = {label: idx for idx, label in enumerate(set(clusters))}
class_set = {label: idx for idx, label in enumerate(set(iris_class))}
from sklearn.metrics import adjusted_rand_score
import matplotlib.pyplot as plt

from concept_formation.trestle import TrestleTree
from concept_formation.cluster import cluster
from concept_formation.datasets import load_rb_wb_03
from concept_formation.preprocessor import ObjectVariablizer

# Seed first: the sample taken below depends on the shuffle order.
seed(0)

towers = load_rb_wb_03()
shuffle(towers)
towers = towers[:60]

# Every instance is transformed by the ObjectVariablizer before use.
variablizer = ObjectVariablizer()
towers = [variablizer.transform(t) for t in towers]

tree = TrestleTree()
# Gather all clusterings produced by cluster() into a list.
clusters = list(cluster(tree, towers, maxsplit=10))
human_labels = [tower['_human_cluster_label'] for tower in towers]

# x: 1-based position of each clustering; y: its ARI against human labels.
x = list(range(1, len(clusters) + 1))
y = [adjusted_rand_score(human_labels, split) for split in clusters]
plt.plot(x, y, label="TRESTLE")

plt.title("TRESTLE Clustering Accuracy (Given Human Ground Truth)")
plt.ylabel("Adjusted Rand Index (Agreement Correcting for Chance)")
plt.xlabel("# of Splits of Trestle Tree")
plt.legend(loc=4)
plt.show()
from sklearn.metrics import adjusted_rand_score

from concept_formation.cobweb import CobwebTree
from concept_formation.cluster import cluster
from concept_formation.datasets import load_mushroom

# Fixed seed, then shuffle and keep the first 150 instances.
seed(0)
mushrooms = load_mushroom()
shuffle(mushrooms)
mushrooms = mushrooms[:150]

tree = CobwebTree()

# Copy of every instance without its 'classification' attribute.
mushrooms_no_class = [
    {attr: value for attr, value in mushroom.items()
     if attr != 'classification'}
    for mushroom in mushrooms
]

# Only the first clustering produced by cluster() is used.
clusters = next(cluster(tree, mushrooms_no_class))

# The held-out 'classification' values, compared with the clusters via ARI.
mushroom_class = [
    mushroom['classification'] for mushroom in mushrooms
    if 'classification' in mushroom
]
ari = adjusted_rand_score(clusters, mushroom_class)

# Vectorise the class-free instances and project them to two dimensions.
dv = DictVectorizer(sparse=False)
mushroom_X = dv.fit_transform(mushrooms_no_class)

pca = PCA(n_components=2)
mushroom_2d_x = pca.fit_transform(mushroom_X)

colors = ['b', 'g', 'r', 'y', 'k', 'c', 'm']
# Index lookups for the distinct cluster names and class values.
clust_set = {label: idx for idx, label in enumerate(set(clusters))}
class_set = {label: idx for idx, label in enumerate(set(mushroom_class))}
Exemplo n.º 11
0
from sklearn.metrics import adjusted_rand_score
import matplotlib.pyplot as plt

from concept_formation.trestle import TrestleTree
from concept_formation.cluster import cluster
from concept_formation.datasets import load_rb_wb_03
from concept_formation.preprocessor import ObjectVariablizer

# Fixed seed so the shuffle below is repeatable.
seed(0)

# Load, shuffle and keep a 60-instance sample.
towers = load_rb_wb_03()
shuffle(towers)
towers = towers[:60]

# Pass every instance through the ObjectVariablizer before clustering.
variablizer = ObjectVariablizer()
towers = [variablizer.transform(t) for t in towers]

tree = TrestleTree()
# Materialise the clusterings: len() below needs a sized sequence, and
# cluster() may hand back a lazy iterator rather than a list.
clusters = list(cluster(tree, towers, maxsplit=10))
human_labels = [tower['_human_cluster_label'] for tower in towers]

# One ARI score per clustering, plotted against its 1-based position.
x = list(range(1, len(clusters) + 1))
y = [adjusted_rand_score(human_labels, split) for split in clusters]
plt.plot(x, y, label="TRESTLE")

plt.title("TRESTLE Clustering Accuracy (Given Human Ground Truth)")
plt.ylabel("Adjusted Rand Index (Agreement Correcting for Chance)")
plt.xlabel("# of Splits of Trestle Tree")
plt.legend(loc=4)
plt.show()
from sklearn.decomposition import PCA
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import adjusted_rand_score

from concept_formation.cobweb import CobwebTree
from concept_formation.cluster import cluster
from concept_formation.datasets import load_mushroom

# Shuffle and keep the first 150 instances.
mushrooms = load_mushroom()
shuffle(mushrooms)
mushrooms = mushrooms[:150]

tree = CobwebTree()

# Copy of every instance without its 'classification' attribute.
mushrooms_no_class = [
    {attr: value for attr, value in mushroom.items()
     if attr != 'classification'}
    for mushroom in mushrooms
]
clusters = cluster(tree, mushrooms_no_class)[0]

# The held-out 'classification' values, compared with the clusters via ARI.
mushroom_class = [
    mushroom['classification'] for mushroom in mushrooms
    if 'classification' in mushroom
]
ari = adjusted_rand_score(clusters, mushroom_class)

# Vectorise the class-free instances and project them to two dimensions.
dv = DictVectorizer(sparse=False)
mushroom_X = dv.fit_transform(mushrooms_no_class)

pca = PCA(n_components=2)
mushroom_2d_x = pca.fit_transform(mushroom_X)

colors = ['b', 'g', 'r', 'y', 'k', 'c', 'm']
# Index lookups for the distinct cluster names and class values.
clust_set = {label: idx for idx, label in enumerate(set(clusters))}
class_set = {label: idx for idx, label in enumerate(set(mushroom_class))}

for class_idx, class_label in enumerate(class_set):
def run_clust_exp(nominal_noise=0, numeric_noise=0, scaling=False):
    """Generate a noisy three-group data set and cluster it with Cobweb3.

    60 instances are created for each of the groups "G1", "G2" and "G3".
    An instance stores its group under ``'_label'`` and has two nominal
    attributes (``'f1'``, ``'f2'``) and two numeric attributes (``'f3'``,
    ``'f4'``).  For every attribute a ``random()`` draw is compared with
    the matching noise level; when the draw is below it, the attribute
    gets a value typical of one of the *other* two groups.

    Args:
        nominal_noise: noise threshold for ``'f1'`` and ``'f2'``.
        numeric_noise: noise threshold for ``'f3'`` and ``'f4'``.
        scaling: passed unchanged to ``Cobweb3Tree(scaling=...)``.

    Returns:
        A tuple ``(data, labels)``: ``data`` is the shuffled list of
        instance dicts and ``labels`` is the first clustering yielded by
        ``cluster`` for that list.
    """
    groups = ("G1", "G2", "G3")
    instances_per_group = 60
    # (mean, standard deviation) of each numeric attribute, per group.
    numeric_params = {
        'f3': {"G1": (4, 1), "G2": (10, 1), "G3": (16, 1)},
        'f4': {"G1": (20, 2), "G2": (32, 2), "G3": (44, 2)},
    }

    def sample_numeric(attr, group):
        mean, std = numeric_params[attr][group]
        return np.random.normal(mean, std, 1)[0]

    data = []
    for group in groups:
        others = [g for g in groups if g != group]
        for _ in range(instances_per_group):
            x = {'_label': group}

            if random() >= nominal_noise:
                x['f1'] = group + "f1"
            else:
                # Noise always comes from another group.  The G2 branch
                # used to offer "G2f1" itself and never "G1f1".
                x['f1'] = choice([g + "f1" for g in others])

            if random() >= nominal_noise:
                x['f2'] = choice([group + "f2a", group + "f2b"])
            else:
                x['f2'] = choice([g + suffix for g in others
                                  for suffix in ("f2a", "f2b")])

            for attr in ('f3', 'f4'):
                if random() >= numeric_noise:
                    x[attr] = sample_numeric(attr, group)
                else:
                    # Draw one candidate per other group, then pick one.
                    x[attr] = choice(
                        [sample_numeric(attr, g) for g in others])

            data.append(x)

    shuffle(data)
    t = Cobweb3Tree(scaling=scaling)
    clustering = cluster(t, data)
    return data, next(clustering)
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import adjusted_rand_score

from concept_formation.cobweb3 import Cobweb3Tree
from concept_formation.cluster import cluster
from concept_formation.datasets import load_iris

# Fixed seed so the shuffle below is repeatable.
seed(0)
irises = load_iris()
shuffle(irises)

tree = Cobweb3Tree()

# Copy of every instance without its 'class' attribute.
irises_no_class = [
    {attr: value for attr, value in iris.items() if attr != 'class'}
    for iris in irises
]
# Only the first clustering produced by cluster() is used.
clusters = next(cluster(tree, irises_no_class))

# The held-out 'class' values, compared with the clusters via ARI.
iris_class = [iris['class'] for iris in irises if 'class' in iris]
ari = adjusted_rand_score(clusters, iris_class)

# Vectorise the class-free instances built above (no need to rebuild the
# same list a second time) and project them to two dimensions.
dv = DictVectorizer(sparse=False)
iris_X = dv.fit_transform(irises_no_class)
pca = PCA(n_components=2)
iris_2d_x = pca.fit_transform(iris_X)

colors = ['b', 'g', 'r', 'y', 'k', 'c', 'm']
shapes = ['o', '^', '+']
# Index lookups for the distinct cluster names and class values.
clust_set = {label: idx for idx, label in enumerate(set(clusters))}
class_set = {label: idx for idx, label in enumerate(set(iris_class))}