Example #1
    def sm_training(self):
        """
        Train the model with different parameters.
        """
        file = askopenfilename(initialdir=dir_name, title="Select Data",
                               filetypes=[("csv files", "*.csv")])

        # askopenfilename returns an empty string when the dialog is cancelled
        if not file:
            tk.messagebox.showerror("Error", "Your chosen file is not valid.\nPlease choose again.")
            return

        data = pd.read_csv(file)
        # ind=data[data.columns[0]]
        
        # data = data.set_index(ind)
        comp_names = list(data.columns)
        index = data.index

        # test cali housing first
        df = data.fillna(0).values

        # initialize the build
        sm=SOMFactory().build(
            data=df,
            mapsize=(int(self.Mapsize_x.get()), int(self.Mapsize_y.get())),
            mask=None,
            mapshape='planar',
            lattice=self.Lattice_ent.get(),
            normalization=self.Normalization_ent.get(),
            initialization=self.Initialization_ent.get(),
            neighborhood='gaussian',
            training='batch',
            name='sompy',
            component_names=comp_names)

        # start training
        sm.train(n_job=int(self.n_job_ent.get()),
                shared_memory=self.shared_memory_ent.get(),
                verbose=self.verbose_ent.get(),
                train_rough_len=int(self.train_rough_len_ent.get()),
                train_rough_radiusin=int(self.train_rough_rin_ent.get()),
                train_rough_radiusfin=int(self.train_rough_rfin_ent.get()),
                train_finetune_len=int(self.train_ft_len_ent.get()),
                train_finetune_radiusin=int(self.train_ft_rin_ent.get()),
                train_finetune_radiusfin=int(self.train_ft_rfin_ent.get()),
                train_len_factor=int(self.train_len_factor_ent.get()),
                maxtrainlen=np.inf)

        # errors calculation
        topographic_error = sm.calculate_topographic_error()
        quantization_error = np.mean(sm._bmu[1])

        # if multiple runs are required
        # joblib.dump(sm, "model_{}.joblib".format(i))

        with open("Models/sm_model", "wb") as model_file:
            pickle.dump(sm, model_file)

        # print errors on the command prompt
        print("the topographic error is %s" % topographic_error)
        print("the quantization error is %s" % quantization_error)
Example #2
 def __init__(self, df, mapsize, initialization='random'):
     """
     
     :param df:              数据框 
     :param mapsize:         输出层维度,一般为二维,输入(20,20)的形式
     :param initialization:  "PCA" 或 "random",初始化权重的方法
             - PCA是以变量的主成分值作为权重,见sompy.codebool.pca_linear_initialization
             - random是以随机数进行初始化
     """
     self.data = np.array(df)
     self.sm = SOMFactory().build(self.data,
                                  mapsize=mapsize,
                                  initialization=initialization,
                                  component_names=df.columns)
     self.train()
Example #3
    def build_som(self, X):
        print('Building SOM...')
        sm = SOMFactory().build(X,
                                normalization='var',
                                mapsize=(15, 15),
                                initialization='pca')
        sm.train(n_job=1,
                 verbose='info',
                 train_rough_len=200,
                 train_finetune_len=100)

        topographic_error = sm.calculate_topographic_error()
        quantization_error = np.mean(sm._bmu[1])
        print ("Topographic error = {}; Quantization error = {}"\
        .format(topographic_error,quantization_error))
        return sm
Example #4
def soms(data):
    """
    Input: 3-D array (nt,ny,nx)
    """
    #Reshape data
    nt, ny, nx = data.shape
    data = np.reshape(data, [nt, ny * nx], order='F')

    sm = SOMFactory().build(data,
                            mapsize=(5, 5),
                            normalization=None,
                            initialization='pca')
    sm.train(n_job=-1,
             verbose=False,
             train_rough_len=20,
             train_finetune_len=10)

    return sm
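A minimal usage sketch for soms(); the array shape and random values below are purely illustrative:

import numpy as np

# Hypothetical driver: a random (nt, ny, nx) stack stands in for real gridded data
fake_field = np.random.rand(120, 8, 10)
som_model = soms(fake_field)
print("quantization error:", np.mean(som_model._bmu[1]))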
Example #5
def training_batched_som(map_min_size, map_max_size, nb_models, X_train):
    for i in range(nb_models):
        sm = SOMFactory().build(
            X_train,
            mapsize=[
                random.choice(list(range(map_min_size, map_max_size))),
                random.choice(list(range(map_min_size, map_max_size)))
            ],
            normalization='var',
            initialization='random',
            component_names=names,
            lattice="hexa")
        sm.train(n_job=1,
                 verbose=False,
                 train_rough_len=30,
                 train_finetune_len=100)
        joblib.dump(sm, path + "batched_model_{}.joblib".format(i))
        print("end of training model n°" + str(i))

    # Study the models trained and plot the errors obtained in order to select the best one
    models_pool = glob.glob(path + "batched_model*")
    errors = []
    for model_filepath in models_pool:
        sm = joblib.load(model_filepath)
        topographic_error = sm.calculate_topographic_error()
        quantization_error = sm.calculate_quantization_error()
        errors.append((topographic_error, quantization_error))
    e_top, e_q = zip(*errors)

    plt.scatter(e_top, e_q)
    plt.xlabel("Topographic error")
    plt.ylabel("Quantization error")
    plt.title("Topographic and quantization errors of the models trained")
    plt.show()
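Note that names and path are free variables in training_batched_som, and the snippet relies on imports that are not shown. A minimal driver sketch under those assumptions (all values illustrative):

import glob
import random

import joblib
import numpy as np
from matplotlib import pyplot as plt
from sompy.sompy import SOMFactory

names = ["feat_a", "feat_b", "feat_c"]    # assumed component names
path = "./"                               # assumed output directory
X_train = np.random.rand(500, len(names))
training_batched_som(map_min_size=10, map_max_size=20, nb_models=2, X_train=X_train)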
Example #6
    def _prediction(self):
        """SOM function"""
        try:
            data = np.loadtxt('/home/mininet/testmininet/trainingdata1.txt',
                              delimiter=',')
            names = [
                'Interval', 'Throughput(Mbits/0.5sec)', 'Bandwidth(Mbits/sec)',
                'Jitter(ms)', 'Loss', 'Decision'
            ]

            sm = SOMFactory().build(data,
                                    normalization='var',
                                    initialization='random',
                                    component_names=names)

            sm.train(n_job=1,
                     verbose='info',
                     train_rough_len=15,
                     train_finetune_len=15)

            topographic_error = sm.calculate_topographic_error()
            quantization_error = np.mean(sm._bmu[1])
            line = open('/home/mininet/testmininet/pdata1.txt').readlines()
            log.debug(line)
            comp = line[0].split(",")
            del comp[-1]
            data2 = np.array([[
                float(comp[0]),
                float(comp[1]),
                float(comp[2]),
                float(comp[3]),
                float(comp[4])
            ]])
            sm.cluster(5)
            pred = np.absolute(sm.predict_by(data2, 5))

            self.details.write(comp[4] + "\t" + comp[1] + "\t" + str(pred[0]) +
                               "\t" + str(topographic_error) + "\n")
            print(pred)
            if pred <= 0.5:
                print("No congestion")
                self._congdelay(pred)
            elif pred > 0.5:
                print("Congestion for at least the next 5 seconds")

            self.prevpred = pred
        except IndexError:
            print("ERROR")
Example #7
def self_organizing_map(normalized_df,
                        normalization='var',
                        initialization='pca',
                        n_job=1,
                        train_rough_len=2,
                        train_finetune_len=5,
                        verbose=None):
    # create the SOM network and train it. You can experiment with different normalizations and initializations
    som = SOMFactory().build(normalized_df.values,
                             normalization=normalization,
                             initialization=initialization,
                             component_names=normalized_df.columns)
    som.train(n_job=n_job,
              train_rough_len=train_rough_len,
              train_finetune_len=train_finetune_len,
              verbose=verbose)

    # The quantization error: average distance between each data vector and its BMU.
    # The topographic error: the proportion of all data vectors for which first and second BMUs are not adjacent units.
    topographic_error = som.calculate_topographic_error()
    quantization_error = np.mean(som._bmu[1])
    print("Topographic error = %s; Quantization error = %s" %
          (topographic_error, quantization_error))
    return som
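A minimal usage sketch, assuming the numpy/pandas/SOMFactory imports implied by the snippet; the DataFrame below is illustrative and should already be normalized:

import numpy as np
import pandas as pd

demo_df = pd.DataFrame(np.random.rand(200, 4), columns=["a", "b", "c", "d"])
som = self_organizing_map(demo_df, train_rough_len=2, train_finetune_len=5)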
Example #8
def training_specific_som(map_x_size, map_y_size, X_train):
    sm = SOMFactory().build(X_train,
                            mapsize=[map_x_size, map_y_size],
                            normalization='var',
                            initialization='random',
                            component_names=names,
                            lattice='hexa')
    sm.train(n_job=1,
             verbose=False,
             train_rough_len=30,
             train_finetune_len=100)
    joblib.dump(sm, path + "batched_model_specific{}.joblib".format(0))
    print("Topographic error: " + str(sm.calculate_topographic_error()) +
          ", Quantization error: " + str(sm.calculate_quantization_error()) +
          "\n")
    return sm
Example #9
def cluster_category_data(df,
                          scale_data='minmax',
                          dim_red_method='som',
                          use_elbow_method=True,
                          cluster_method='hierarchical',
                          n_clusters=None,
                          verbose=1,
                          perplexity=None):
    """
    :param df: dataframe containing all the columns belonging to a category to be used in clustering
    :param scale_data: method to be used to scale the dataset
    :param dim_red_method: options are 'som', 'umap', 'tsne', None. If  None, do clustering directly.
    :param use_elbow_method: if True, elbow method is used to find the optimum number of clusters. If False, n_clusters needs to be specified
    :param cluster_method: options are 'kmeans' and 'hierarchical'. In either case kmeans is used for the elbow method(because of the time required).
    :param n_clusters: If use_elbow_method is False, n_clusters needs to be given.
    :param verbose: If True, output the progress in clustering process
    :param perplexity: If method used is TSNE, perplexity nedds to be specified
    """
    t = time.time()

    if scale_data == 'minmax':
        X = MinMaxScaler().fit_transform(df)
    elif scale_data == 'standard':
        X = StandardScaler().fit_transform(df)
    else:
        X = df.values

    if verbose:
        print(f'number of features = {df.shape[1]}')

    if dim_red_method == 'som':
        if verbose:
            print(
                'Self Organising Maps is being used for dimensionality reduction...'
            )
        opt_k = 2
        max_s = -1
        f = 0
        for mapsize in [(30, 30)]:
            if verbose:
                print(f'map size = {mapsize}')
            sm = SOMFactory().build(X,
                                    normalization='var',
                                    initialization='pca',
                                    mapsize=mapsize)
            sm.train(n_job=1,
                     verbose=False,
                     train_rough_len=100,
                     train_finetune_len=500)
            if use_elbow_method:
                model = KElbowVisualizer(KMeans(), k=20, timings=False)
                elbow = model.fit(sm.codebook.matrix).elbow_value_
                if elbow and verbose:
                    print(f'elbow value = {elbow}')
                if not elbow:
                    if verbose:
                        print('elbow not found')
                    ms = -1
                    for k in range(2, 20):
                        km_labels = KMeans(k).fit_predict(sm.codebook.matrix)
                        s = silhouette_score(sm.codebook.matrix, km_labels)
                        if s > ms:
                            ms = s
                            elbow = k
            else:
                elbow = n_clusters
            x = sm.project_data(X)
            labels, _, _ = sm.cluster(opt=elbow, cl_type=cluster_method)
            clabels = []
            for i in range(X.shape[0]):
                clabels.append(labels[x[i]])
            s_score = silhouette_score(X, clabels)
            if verbose:
                print(f'silhouette score = {round(s_score, 3)}')
            max_s = max(s_score, max_s)
            if (max_s == s_score):
                opt_k = elbow
                opt_labels = clabels
                opt_size = mapsize
            if (max_s > s_score):
                break
        if verbose:
            print(f'optimum mapsize = {opt_size}')
            print(
                f'optimum number of clusters = {opt_k} & silhouette score = {round(max_s,3)}'
            )
            print(f'time taken = {round(time.time() - t, 1)} s')
        return opt_labels, opt_k

    elif dim_red_method:
        if dim_red_method == 'umap':
            print('UMAP is being used for dimensionality reduction...')
            embedding = umap.UMAP(n_components=2,
                                  n_neighbors=5,
                                  min_dist=0.0001,
                                  metric='euclidean',
                                  random_state=1,
                                  spread=0.5,
                                  n_epochs=1000).fit_transform(X)
            print('UMAP embedding done...')
        elif dim_red_method == 'tsne':
            print('t-SNE is being used for dimensionality reduction...')
            embedding = TSNE(perplexity=perplexity).fit_transform(X)
            print('t-SNE embedding is done...')
        if use_elbow_method:
            model = KElbowVisualizer(KMeans(), k=20, timings=False)
            elbow = model.fit(embedding).elbow_value_
        else:
            elbow = n_clusters
        if cluster_method == 'kmeans':
            opt_labels = KMeans(elbow).fit_predict(embedding)
        elif cluster_method == 'hierarchical':
            opt_labels = AgglomerativeClustering(elbow).fit_predict(embedding)
        if verbose:
            s_score = silhouette_score(X, opt_labels)
            print(
                f'number of clusters = {elbow} and silhouette_score = {s_score}'
            )
        return opt_labels, elbow

    else:
        if use_elbow_method:
            model = KElbowVisualizer(KMeans(), k=20, timings=False)
            elbow = model.fit(X).elbow_value_
        else:
            elbow = n_clusters
        if cluster_method == 'kmeans':
            opt_labels = KMeans(elbow).fit_predict(X)
        elif cluster_method == 'hierarchical':
            opt_labels = AgglomerativeClustering(elbow).fit_predict(X)
        print(f'silhouette score = {round(silhouette_score(X,opt_labels),3)}')
        return opt_labels, elbow
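A minimal usage sketch with made-up data; dim_red_method=None takes the direct-clustering branch, which avoids the slower SOM path:

import numpy as np
import pandas as pd

demo = pd.DataFrame(np.random.rand(300, 6), columns=list("abcdef"))
labels, k = cluster_category_data(demo, dim_red_method=None,
                                  use_elbow_method=False, n_clusters=4,
                                  cluster_method='kmeans')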
Example #10
# -*- coding: utf-8 -*-
"""
Created on Sat Oct  7 15:09:18 2017

@author: Ethan
"""

import numpy as np
from matplotlib import pyplot as plt
from sompy.sompy import SOMFactory

data = np.random.randint(0, 255, (100, 3))

dims = np.array([5, 5])
iterations = 2000
learningRate = 0.01

# normalize
data = data / data.max()

sm = SOMFactory().build(data, normalization='var', initialization='random',
                        component_names=['r', 'g', 'b'])
sm.train(n_job=1, verbose=False, train_rough_len=2, train_finetune_len=5)
topographic_error = sm.calculate_topographic_error()
quantization_error = np.mean(sm._bmu[1])
Example #11
# Centroids from the hierarchical clustering with non-standardized values
scaler = StandardScaler()
to_revert = final_df_hc.groupby(['Cluster'])[['Years_Education', 'Salary_Invested', 'CMV']].mean()
# final_result = pd.DataFrame(scaler.inverse_transform(X=to_revert),
#                             columns=['Years_Education', 'Salary_Invested', 'CMV'])

# 14. SOM
# 14.1. SOM FOR CONSUMPTION
# Define SOM grid size
mapsize_consump = 9

# Create algorithm, define parameters and train the grid
sm_consump = SOMFactory().build(data=std_cons.values,
                                mapsize=(mapsize_consump, mapsize_consump),
                                normalization='var',
                                initialization='random',
                                component_names=Consumption.columns,
                                lattice='rect',
                                training='batch')
sm_consump.train(n_job=6,
                 verbose='info',
                 train_rough_len=35,
                 train_finetune_len=80)

# 'final_clusters_consump' is a dataframe similar to df, plus a 'Labels' column giving the closest neuron for each observation
final_clusters_consump = pd.DataFrame(sm_consump._data, columns=Consumption.columns).set_index(Consumption.index)
my_labels_c = pd.DataFrame(sm_consump._bmu[0], columns=['Labels']).set_index(Consumption.index)
final_clusters_consump = pd.concat([final_clusters_consump, my_labels_c], axis=1)

# Plot the number of observations associated to each neuron
vhts_c = BmuHitsView(12, 12, "Hits Map", text_size=7)
Example #12
cc_kmeans = pd.DataFrame(
    scaler.inverse_transform(X=kmeans_cust.cluster_centers_),
    columns=customer_related_num)
sizes = df["kmc_cluster"].value_counts() / len(df["kmc_cluster"])
ap_kmeans = {"cc": cc_kmeans, "sil_score": silhouette_avg, "sizes": sizes}

### 2. Approach: SOM followed by K-Means
scaler = StandardScaler()
cust_norm = scaler.fit_transform(df[customer_related_num])
df_cust_norm = pd.DataFrame(cust_norm, columns=customer_related_num)

X = df_cust_norm.values
sm = SOMFactory().build(data=X,
                        mapsize=(8, 8),
                        normalization='var',
                        initialization="pca",
                        component_names=customer_related_num,
                        lattice="hexa",
                        training="batch")
sm.train(n_job=5, verbose='info', train_rough_len=40, train_finetune_len=100)
final_clusters = pd.DataFrame(sm._data, columns=customer_related_num)
my_labels = pd.DataFrame(sm._bmu[0])
final_clusters = pd.concat([final_clusters, my_labels], axis=1)
cluster_cols = customer_related_num + ["Labels"]
final_clusters.columns = cluster_cols
som_cluster = final_clusters.groupby("Labels").mean()
#create_elbowgraph(10, som_cluster)
kmeans = KMeans(n_clusters=3, random_state=1).fit(som_cluster)
som_cluster["somk_cluster"] = kmeans.labels_
k_cluster = som_cluster.groupby("somk_cluster").mean()
k_cluster = pd.DataFrame(scaler.inverse_transform(X=k_cluster),
Example #13
    Fv *= 1.e-17

    mags = get_appmags(vs.value[::-1], Fv.value[::-1], filters, printit=False)

    if meta['AGE'][i] < 0.3:
        ##  Z.append([mags[i] for i in gunn + isubaru])
        Z.append([mags[i] for i in gunn])

##
Z = np.array(Z)

print(Z)
print('\n\n')

sm = SOMFactory().build(Z,
                        normalization='var',
                        initialization='random',
                        component_names=gunn)
sm.train(n_job=1, verbose=False, train_rough_len=2, train_finetune_len=5)

topographic_error = sm.calculate_topographic_error()
quantization_error = np.mean(sm._bmu[1])

print("Topographic error = %s; Quantization error = %s" %
      (topographic_error, quantization_error))

vhts = BmuHitsView(10, 10, 'Hits Map', text_size=7)
vhts.show(sm,
          anotate=True,
          onlyzeros=False,
          labelsize=12,
          cmap='Greys',
Example #14
df = df.drop(columns=colunas_apagar)
df = df.dropna()

for c in df:
    if df[c].dtype == 'object':
        encoder = OrdinalEncoder()
        try:
            df[c] = encoder.fit_transform(df[c].values.reshape(-1, 1))
        except TypeError:
            print('dropping column:', c)
            df = df.drop(columns=c)

sm = SOMFactory().build(df.values,
                        [50, 50],
                        mask=None, mapshape='planar',
                        lattice='rect',
                        normalization='var',
                        initialization='pca',
                        component_names=list(df.columns))

sm.train(n_job=2, verbose='info', train_rough_len=30, train_finetune_len=20)

with open(
    '/content/drive/My Drive/IC_Cristine/SOM/som_primeiro.pkl',
    'wb') as arq:
    pickle.dump(sm, arq)

view2D = sompy.mapview.View2D(100, 100, "rand data", text_size=14)
view2D.show(sm, col_sz=5, which_dim="all", denormalize=True)

topographic_error = sm.calculate_topographic_error()
Example #15
def SOM_clustering(data,
                   grid_size_list,
                   scale='minmax',
                   plot_grid=True,
                   n_clusters=None):
    X = scale_data(data, scale)
    terror = {}
    sm = {}
    for mapsize in grid_size_list:
        print(f'grid size = {mapsize}')
        sm[str(mapsize)] = SOMFactory().build(X,
                                              normalization='var',
                                              initialization='pca',
                                              mapsize=mapsize)
        sm[str(mapsize)].train(n_job=1,
                               verbose=False,
                               train_rough_len=4,
                               train_finetune_len=10)
        quant_error = np.array(sm[str(mapsize)]._quant_error)[-1:, 2][0]
        topo_error = sm[str(mapsize)].calculate_topographic_error()
        terror[str(mapsize)] = topo_error
        print(f'quantization error = {quant_error}')
        print(f'topographical error = {topo_error}')

    min_terror = 1
    for mapsize in grid_size_list:
        min_terror = min(min_terror, terror[str(mapsize)])
        if (min_terror == terror[str(mapsize)]):
            opt_mapsize = mapsize

    sm = sm[str(opt_mapsize)]
    x = sm.project_data(X)
    if n_clusters:
        print(f'number of clusters = {n_clusters}')
        labels, _, _ = sm.cluster(opt=n_clusters, cl_type='kmeans')
        cluster_labels = []
        for i in range(X.shape[0]):
            cluster_labels.append(labels[x[i]])
        s_score = silhouette_score(X, cluster_labels)
        print(f'silhouette score = {s_score}')
        if plot_grid:
            image_data = labels.reshape(opt_mapsize)
            plt.figure(figsize=(25, 15))
            plt.imshow(image_data, cmap='viridis')
            plt.grid()
            plt.savefig('SOM_cluster_map.png', dpi=200)
    else:
        max_s = -1
        labels = {}
        cluster_labels = {}
        for clust in [100, 150, 200, 250, 300]:
            print(f'number of clusters = {clust}')
            labels[clust], _, _ = sm.cluster(opt=clust, cl_type='kmeans')
            cluster_labels[clust] = []
            for i in tqdm(range(X.shape[0])):
                cluster_labels[clust].append(labels[clust][x[i]])
            s_score = silhouette_score(X, cluster_labels[clust])
            print(f'silhouette score = {s_score}')
            max_s = max(max_s, s_score)
            if (max_s == s_score):
                opt_clust = clust
        print(f'optimum number of clusters = {opt_clust}')
        if plot_grid:
            image_data = labels[opt_clust].reshape(opt_mapsize)
            plt.figure(figsize=(25, 15))
            plt.imshow(image_data, cmap='viridis')
            plt.grid()
            plt.savefig('SOM_cluster_map.png', dpi=200)
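A minimal driver sketch for SOM_clustering; scale_data() and the other imports are assumed from the snippet's surrounding module, and the grid sizes and cluster count are illustrative:

import numpy as np

X_demo = np.random.rand(1000, 5)
SOM_clustering(X_demo, grid_size_list=[(10, 10), (15, 15)],
               scale='minmax', plot_grid=False, n_clusters=20)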
Example #16
# shuffle the data
training_split_0 = shuffle(training_split_0)
validation_split_0 = shuffle(validation_split_0)

# reset the index after shuffle
training_split_0.reset_index(inplace=True, drop=True)
validation_split_0.reset_index(inplace=True, drop=True)

print(training_split_0.head())

# Train the data
mapSize = [20, 20]

sm = SOMFactory().build(training_split_0.values,
                        mapSize,
                        normalization="var",
                        lattice="rect",
                        initialization="random",
                        component_names=training_split_0.columns)
sm.train(
    n_job=1, verbose=None, train_rough_len=2,
    train_finetune_len=100)  # parameters kept mostly as in the provided example

# plot the results, components map
from sompy.visualization.mapview import View2D

view2D = View2D(20, 20, "", text_size=12)
view2D.show(sm, col_sz=3, which_dim="all", denormalize=False)

# Hit maps
from sompy.visualization.bmuhits import BmuHitsView
Example #17
input_data = pd.read_csv("data/DatasetNuovoConAAeBFiltrato.csv", delimiter=";")
trunc_data = input_data.drop(["Ticker", "Issuer", "Rating"], axis=1)

# Column-wise normalization of the dataset
#trunc_data = (trunc_data - trunc_data.min() ) / ( trunc_data.max() - trunc_data.min())

# Get the list of headers, save it in names, and drop them from the dataset
data = trunc_data
names = list(trunc_data.columns.values)
print("FEATURES: ", ", ".join(names))
data = data.values

# Train the SOM
#msz = calculate_msz(data)
sm = SOMFactory().build(data,
                        normalization='var',
                        initialization='random',
                        component_names=names)
sm.train(n_job=1, verbose='info', train_rough_len=2, train_finetune_len=300)

# Compute the topographic and quantization errors
topographic_error = sm.calculate_topographic_error()
quantization_error = np.mean(sm._bmu[1])
print("Topographic error = %s; Quantization error = %s" %
      (topographic_error, quantization_error))

# Display the component planes
from sompy.visualization.mapview import View2D
view2D = View2D(10, 10, "rand data", text_size=10)
view2D.show(sm, col_sz=4, which_dim="all", desnormalize=True)

# Display the BMU hits view
Example #18
import numpy as np
from sompy.sompy import SOMFactory
from sklearn import datasets

# import iris dataset
iris = datasets.load_iris()
data = iris.data
labels = iris.target

# initialization SOM
sm = SOMFactory().build(data, normalization='var', initialization='pca')
sm.train(n_job=1, verbose=True, train_rough_len=2, train_finetune_len=5)

# The quantization error: average distance between each data vector and its BMU.
# The topographic error: the proportion of all data vectors for which first and second BMUs are not adjacent units.
topographic_error = sm.calculate_topographic_error()
quantization_error = np.mean(sm._bmu[1])
print("Topographic error = %s; Quantization error = %s" %
      (topographic_error, quantization_error))

# component planes view
from sompy.visualization.mapview import View2D
view2D = View2D(10, 10, "rand data", text_size=12)
view2D.show(sm, col_sz=4, which_dim="all", desnormalize=True)

# U-matrix plot
from sompy.visualization.umatrix import UMatrixView

umat = UMatrixView(width=10, height=10, title='U-matrix')
umat.show(sm)
Example #19
                     alpha=0.9)
##################################################
#SOMPY clustering
#practically jon's code
s_var = 'SOM'
from sompy.sompy import SOMFactory

data_std.sample(2)

data = data_std.drop([k_var, d_var], axis=1).to_numpy()
names = data_std.columns.values
print(data[1:2, ])

sm = SOMFactory().build(data,
                        mapsize=(10, 15),
                        normalization='var',
                        initialization='random',
                        component_names=names)
sm.train(n_job=4, verbose=False, train_rough_len=2, train_finetune_len=5)

topographic_error = sm.calculate_topographic_error()
quantization_error = np.mean(sm._bmu[1])
print "Topographic error = %s; Quantization error = %s" % (topographic_error,
                                                           quantization_error)

from sompy.visualization.mapview import View2D

view2D = View2D(10, 10, "rand data", text_size=10)
view2D.show(sm, col_sz=4, which_dim="all", desnormalize=True, cmap='plasma')

k_val = 4
Example #20
class MySOM:
    def __init__(self, df, mapsize, initialization='random'):
        """
        
        :param df:              数据框 
        :param mapsize:         输出层维度,一般为二维,输入(20,20)的形式
        :param initialization:  "PCA" 或 "random",初始化权重的方法
                - PCA是以变量的主成分值作为权重,见sompy.codebool.pca_linear_initialization
                - random是以随机数进行初始化
        """
        self.data = np.array(df)
        self.sm = SOMFactory().build(self.data,
                                     mapsize=mapsize,
                                     initialization=initialization,
                                     component_names=df.columns)
        self.train()

    def train(self):
        self.sm.train(n_job=1,
                      verbose=False,
                      train_rough_len=2,
                      train_finetune_len=5)

    def print_error(self):
        topographic_error = self.sm.calculate_topographic_error()
        quantization_error = np.mean(self.sm._bmu[1])
        print("Topographic error = %s; Quantization error = %s" %
              (topographic_error, quantization_error))

    def draw_input_weights(self):
        from sompy.visualization.mapview import View2D
        view2D = View2D(10, 10, "rand data", text_size=10)
        view2D.show(self.sm, col_sz=4, which_dim="all", desnormalize=True)
        plt.show()

    def draw_hit_map(self):
        from sompy.visualization.bmuhits import BmuHitsView
        vhts = BmuHitsView(4, 4, "Hits Map", text_size=12)
        vhts.show(self.sm,
                  anotate=True,
                  onlyzeros=False,
                  labelsize=12,
                  cmap="Greys",
                  logaritmic=False)
        plt.show()

    def draw_cluster_map(self):
        from sompy.visualization.hitmap import HitMapView
        hits = HitMapView(20, 20, "Clustering", text_size=12)
        hits.show(self.sm)
        plt.show()

    def cluster(self, n):
        self.sm.cluster(n)

    def get_cluster_label(self):
        # length equals mapsize[0] * mapsize[1]
        return self.sm.cluster_labels

    def get_neurons(self):
        """
        获取原数据的每个样本对应的神经元,原包并未提供此方法,所以自己动手
        :return: array, length = self.df.shape[0]
        """
        return self.sm._bmu[0]

    def get_label(self):
        """
        获取原数据的每个样本对应的分类标签,原包并未提供此方法,所以自己动手
        :return: array, length = self.df.shape[0]
        """
        neurons_label_dict = {
            i: j
            for i, j in enumerate(self.sm.cluster_labels)
        }
        return np.array([neurons_label_dict[i] for i in self.sm._bmu[0]])

    def predict(self, x):
        """
        以label作为y,采取各种机器学习算法
        :param x: 
        :return: 
        """
        pass
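A minimal usage sketch for MySOM; the map size and cluster count are arbitrary, and the numpy/pandas/matplotlib/SOMFactory imports are assumed as above:

import numpy as np
import pandas as pd

df_demo = pd.DataFrame(np.random.rand(300, 4), columns=["a", "b", "c", "d"])
som = MySOM(df_demo, mapsize=(10, 10))
som.print_error()
som.cluster(4)                  # k-means over the 10x10 codebook vectors
print(som.get_label()[:10])     # cluster label for the first ten samples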
Example #21
fout_train = tfpinit.fout_train

# ---- read data from csv via pandas
fluid_data_df = pd.read_csv(fin,
                            index_col='Name', usecols=input_tfprop)# index_col='Name', by xinyuewang
fluid_data_df = fluid_data_df[input_tfprop[1:]]  # reorder of column
fluid_name_df = pd.DataFrame(fluid_data_df.index)

columns = np.array(tfpinit.name_tfprop) if tfpinit.name_tfprop \
          else np.array(fluid_data_df.columns)
# names = np.array(fluid_name_df)
#descr = fluid_data_df.as_matrix()
descr = fluid_data_df.to_numpy()

# make SOM instance
sm = SOMFactory.build(descr, mapsize=mapsize, normalization='var',
                      initialization='pca', component_names=columns)

if __name__ == "__main__":
    # execute SOM training
    sm.train(n_job=n_job, verbose='debug', train_rough_len=0,
             train_finetune_len=0)

    topographic_error = sm.calculate_topographic_error()
    quantization_error = np.mean(sm._bmu[1])
    print("Topographic error = {}; Quantization error = {};"
          .format(topographic_error, quantization_error))

    # output sm.codebook.matrix as HDF5 format
    if isOutTrain:
        print("Saving SOM trained data to {}...".format(fout_train))
        out_df = pd.DataFrame(sm.codebook.matrix, columns=input_tfprop[1:])
Example #22
# get columns Lat, Long, Mean Temp, Max Temp, Min temp, Precipitation
data = concatenated_df[['Lat', 'Long', 'Tm', 'Tx', 'Tn', 'P']]
data = data.apply(pd.to_numeric, errors='coerce')
data = data.dropna(how='any')
names = [
    'Latitude', "longitude", 'Monthly Median temperature (C)',
    'Monthly Max temperature (C)', 'Monthly Min temperature (C)',
    'Monthly total precipitation (mm)'
]

print(data.head())

# create the SOM network and train it. You can experiment with different normalizations and initializations
sm = SOMFactory().build(data.values,
                        normalization='var',
                        initialization='pca',
                        component_names=names)
sm.train(n_job=1, verbose=False, train_rough_len=2, train_finetune_len=5)

# The quantization error: average distance between each data vector and its BMU.
# The topographic error: the proportion of all data vectors for which first and second BMUs are not adjacent units.
topographic_error = sm.calculate_topographic_error()
quantization_error = np.mean(sm._bmu[1])
print("Topographic error = %s; Quantization error = %s" %
      (topographic_error, quantization_error))

# component planes view
from sompy.visualization.mapview import View2D
view2D = View2D(10, 10, "rand data", text_size=12)
view2D.show(sm, col_sz=4, which_dim="all", desnormalize=True)
Example #23
# Resetting the index so it runs in order: since I only selected April, a standard index is all messed up
# (April 2008 to April 2009 skips a ton of index values), which makes it impossible to join the cluster output index with df
df.index
df = df.reset_index(level=0)
del df['index']
#names = ['zP', 'zVPD', 'pr' , 'vpd', 'Latitude', 'Longitude', 'cum_daysinarow_lowpr','tmmx','tmmn','daysabove28','NDVI','zNDVI']
#names = ['pr','vpd','cum_daysinarow_lowpr','tmmx','NDVI', 'Latitude', 'Longitude']



##Investigating
df.info()

som = None
# create the SOM network and train it. You can experiment with different normalizations and initializations
# 'names' is only defined in the commented-out lines above, so use the dataframe columns
som = SOMFactory().build(df.values, mapsize=[50, 50], normalization='var', initialization='pca', component_names=list(df.columns))
som.train(n_job=1, verbose=False, train_rough_len=2, train_finetune_len=5)

# The quantization error: average distance between each data vector and its BMU.
# The topographic error: the proportion of all data vectors for which first and second BMUs are not adjacent units.
topographic_error = som.calculate_topographic_error()
quantization_error = np.mean(som._bmu[1])
print "Topographic error = %s; Quantization error = %s" % (topographic_error, quantization_error)

from sompy.visualization.mapview import View2D
view2D = View2D(4, 4, "rand data", text_size=16)
view2D.show(som, col_sz=2, which_dim="all", desnormalize=True)

# U-matrix plot
from sompy.visualization.umatrix import UMatrixView
Example #24
#a ton of index values, and this makes it impossible to combine cluster output index with df
df.index
df = df.reset_index(level=0)
del df['index']

##Investigating
# heuristic for the number of map units: 5 * sqrt(rows * columns)
df.info()
dfdrop = df.drop_duplicates()
dfdrop.info()

topo = []
quant = []
array = np.arange(10, 25, 1)
for i in array:
    # 'names' is not defined in this snippet, so use the dataframe columns
    som = SOMFactory().build(df.values, mapsize=[i, i], normalization='var',
                             initialization='pca', component_names=list(df.columns),
                             neighborhood='gaussian', lattice='rect')
    som.train(n_job=1,
              verbose='info',
              train_rough_len=50,
              train_rough_radiusin=4,
              train_finetune_radiusin=1,
              train_finetune_len=50)
    topo.append(som.calculate_topographic_error())
    quant.append(np.mean(som._bmu[1]))
    print(i)
plt.scatter(topo, quant, c=array, s=50)
plt.title('Self Organizing Map')
plt.xlabel('Topographic Error')
plt.ylabel('Quantization Error')
plt.colorbar(label='grid size nxn')