def sm_training(self):
    """Train the model with different parameters."""
    file = askopenfilename(initialdir=dir_name,
                           title="Select Data",
                           filetypes=[("csv files", "*.csv")])
    # askopenfilename returns an empty string when the dialog is cancelled
    if not file:
        tk.messagebox.showerror("Error",
                                "Your chosen file is not valid.\nPlease choose again.")
        return
    content = open(file, "rb")
    data = pd.read_csv(content)
    # ind = data[data.columns[0]]
    # data = data.set_index(ind)
    # store the component names on the instance so the build call below can use them
    self.comp_names = [name for name in data.columns]
    index = data.index
    # test cali housing first
    df = data.fillna(0).values

    # initialize the build
    sm = SOMFactory().build(
        data=df,
        mapsize=(int(self.Mapsize_x.get()), int(self.Mapsize_y.get())),
        mask=None,
        mapshape='planar',
        lattice=self.Lattice_ent.get(),
        normalization=self.Normalization_ent.get(),
        initialization=self.Initialization_ent.get(),
        neighborhood='gaussian',
        training='batch',
        name='sompy',
        component_names=self.comp_names)

    # start training
    sm.train(n_job=int(self.n_job_ent.get()),
             shared_memory=self.shared_memory_ent.get(),
             verbose=self.verbose_ent.get(),
             train_rough_len=int(self.train_rough_len_ent.get()),
             train_rough_radiusin=int(self.train_rough_rin_ent.get()),
             train_rough_radiusfin=int(self.train_rough_rfin_ent.get()),
             train_finetune_len=int(self.train_ft_len_ent.get()),
             train_finetune_radiusin=int(self.train_ft_rin_ent.get()),
             train_finetune_radiusfin=int(self.train_ft_rfin_ent.get()),
             train_len_factor=int(self.train_len_factor_ent.get()),
             maxtrainlen=np.Inf)

    # errors calculation
    topographic_error = sm.calculate_topographic_error()
    quantization_error = np.mean(sm._bmu[1])

    # if multiple runs are required
    # joblib.dump(sm, "model_{}.joblib".format(i))
    pickle.dump(sm, open("Models/sm_model", "wb"))

    # print errors on the cmd prompt
    print("the topographic error is %s " % topographic_error)
    print("the quantization error is %s " % quantization_error)
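# A minimal follow-up sketch (added; not from the original source): reload the
# model pickled by sm_training above. "Models/sm_model" is the path used there.
import pickle

with open("Models/sm_model", "rb") as f:
    sm = pickle.load(f)
print(sm.calculate_topographic_error())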
def build_som(self, X):
    print('Building SOM...')
    sm = SOMFactory().build(X,
                            normalization='var',
                            mapsize=(15, 15),
                            initialization='pca')
    sm.train(n_job=1, verbose='info',
             train_rough_len=200, train_finetune_len=100)
    topographic_error = sm.calculate_topographic_error()
    quantization_error = np.mean(sm._bmu[1])
    print("Topographic error = {}; Quantization error = {}"
          .format(topographic_error, quantization_error))
    return sm
def soms(data):
    """
    Input: 3-D array (nt, ny, nx)
    """
    # Reshape data to (samples, features)
    nt, ny, nx = data.shape
    data = np.reshape(data, [nt, ny * nx], order='F')
    sm = SOMFactory().build(data, mapsize=(5, 5),
                            normalization=None, initialization='pca')
    sm.train(n_job=-1, verbose=False,
             train_rough_len=20, train_finetune_len=10)
    return sm
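# A minimal usage sketch (added; not from the original source): feed soms() a
# random (nt, ny, nx) cube. Assumes numpy and the SOMFactory import from the
# surrounding example are in scope.
import numpy as np

cube = np.random.rand(100, 10, 20)   # (nt, ny, nx)
sm = soms(cube)
print(np.mean(sm._bmu[1]))           # quantization error, as elsewhere in these examples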
def training_batched_som(map_min_size, map_max_size, nb_models, X_train):
    for i in range(nb_models):
        sm = SOMFactory().build(
            X_train,
            mapsize=[
                random.choice(list(range(map_min_size, map_max_size))),
                random.choice(list(range(map_min_size, map_max_size)))
            ],
            normalization='var',
            initialization='random',
            component_names=names,
            lattice="hexa")
        sm.train(n_job=1, verbose=False,
                 train_rough_len=30, train_finetune_len=100)
        joblib.dump(sm, path + "batched_model_{}.joblib".format(i))
        print("end of training model n°" + str(i))

    # Study the trained models and plot their errors in order to select the best one
    models_pool = glob.glob(path + "batched_model*")
    errors = []
    for model_filepath in models_pool:
        sm = joblib.load(model_filepath)
        topographic_error = sm.calculate_topographic_error()
        quantization_error = sm.calculate_quantization_error()
        errors.append((topographic_error, quantization_error))
    e_top, e_q = zip(*errors)

    plt.scatter(e_top, e_q)
    plt.xlabel("Topographic error")
    plt.ylabel("Quantization error")
    plt.title("Topographic and quantization errors of the models trained")
    plt.show()
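# A hedged usage sketch (added; not from the original source). `names` and
# `path` are module-level globals that training_batched_som reads; the values
# below are illustrative only.
import numpy as np

names = ['f1', 'f2', 'f3']
path = './'
X_train = np.random.rand(500, 3)
training_batched_som(map_min_size=5, map_max_size=15, nb_models=3, X_train=X_train)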
def _prediction(self):
    """SOM function"""
    try:
        data = np.loadtxt('/home/mininet/testmininet/trainingdata1.txt',
                          delimiter=',')
        names = [
            'Interval', 'Throughput(Mbits/0.5sec)', 'Bandwidth(Mbits/sec)',
            'Jitter(ms)', 'Loss', 'Decision'
        ]
        sm = SOMFactory().build(data,
                                normalization='var',
                                initialization='random',
                                component_names=names)
        sm.train(n_job=1, verbose='info',
                 train_rough_len=15, train_finetune_len=15)
        topographic_error = sm.calculate_topographic_error()
        quantization_error = np.mean(sm._bmu[1])

        line = open('/home/mininet/testmininet/pdata1.txt').readlines()
        log.debug(line)
        comp = line[0].split(",")
        del comp[len(comp) - 1]
        data2 = np.array([[
            float(comp[0]), float(comp[1]), float(comp[2]),
            float(comp[3]), float(comp[4])
        ]])
        sm.cluster(5)
        pred = np.absolute(sm.predict_by(data2, 5))
        self.details.write(comp[4] + "\t" + comp[1] + "\t" + str(pred[0]) +
                           "\t" + str(topographic_error) + "\n")
        print(pred)
        if pred <= 0.5:
            print("No congestion")
            self._congdelay(pred)
        elif pred > 0.5:
            print("Congestion there for the next 5 seconds at least")
            self.prevpred = pred
    except IndexError:
        print("ERROR")
def self_organizing_map(normalized_df,
                        normalization='var',
                        initialization='pca',
                        n_job=1,
                        train_rough_len=2,
                        train_finetune_len=5,
                        verbose=None):
    # Create the SOM network and train it. You can experiment with different
    # normalizations and initializations.
    som = SOMFactory().build(normalized_df.values,
                             normalization=normalization,
                             initialization=initialization,
                             component_names=normalized_df.columns)
    som.train(n_job=n_job,
              train_rough_len=train_rough_len,
              train_finetune_len=train_finetune_len,
              verbose=verbose)

    # The quantization error: average distance between each data vector and its BMU.
    # The topographic error: the proportion of all data vectors for which the
    # first and second BMUs are not adjacent units.
    topographic_error = som.calculate_topographic_error()
    quantization_error = np.mean(som._bmu[1])
    print("Topographic error = %s; Quantization error = %s" %
          (topographic_error, quantization_error))
    return som
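# A hedged usage sketch (added; not from the original source): min-max scale a
# toy DataFrame and hand it to self_organizing_map. The column names are
# illustrative only.
import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.rand(200, 4), columns=['a', 'b', 'c', 'd'])
normalized_df = (df - df.min()) / (df.max() - df.min())
som = self_organizing_map(normalized_df, train_rough_len=10, train_finetune_len=20)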
def training_specific_som(map_x_size, map_y_size, X_train):
    sm = SOMFactory().build(X_train,
                            mapsize=[map_x_size, map_y_size],
                            normalization='var',
                            initialization='random',
                            component_names=names,
                            lattice='hexa')
    sm.train(n_job=1, verbose=False,
             train_rough_len=30, train_finetune_len=100)
    joblib.dump(sm, path + "batched_model_specific{}.joblib".format(0))
    print("Topographic error: " + str(sm.calculate_topographic_error()) +
          ", Quantization error: " + str(sm.calculate_quantization_error()) + "\n")
    return sm
def cluster_category_data(df,
                          scale_data='minmax',
                          dim_red_method='som',
                          use_elbow_method=True,
                          cluster_method='hierarchical',
                          n_clusters=None,
                          verbose=1,
                          perplexity=None):
    """
    :param df: dataframe containing all the columns belonging to a category to be used in clustering
    :param scale_data: method to be used to scale the dataset
    :param dim_red_method: options are 'som', 'umap', 'tsne', None. If None, do clustering directly.
    :param use_elbow_method: if True, the elbow method is used to find the optimum number of clusters.
        If False, n_clusters needs to be specified.
    :param cluster_method: options are 'kmeans' and 'hierarchical'. In either case kmeans is used for
        the elbow method (because of the time required).
    :param n_clusters: if use_elbow_method is False, n_clusters needs to be given
    :param verbose: if True, output the progress of the clustering process
    :param perplexity: if the method used is TSNE, perplexity needs to be specified
    """
    t = time.time()
    if scale_data == 'minmax':
        X = MinMaxScaler().fit_transform(df)
    elif scale_data == 'standard':
        X = StandardScaler().fit_transform(df)
    else:
        X = df.values
    if verbose:
        print(f'number of features = {df.shape[1]}')

    if dim_red_method == 'som':
        if verbose:
            print('Self Organising Maps is being used for dimensionality reduction...')
        opt_k = 2
        max_s = -1
        for mapsize in [(30, 30)]:
            if verbose:
                print(f'map size = {mapsize}')
            sm = SOMFactory().build(X,
                                    normalization='var',
                                    initialization='pca',
                                    mapsize=mapsize)
            sm.train(n_job=1, verbose=False,
                     train_rough_len=100, train_finetune_len=500)
            if use_elbow_method:
                model = KElbowVisualizer(KMeans(), k=20, timings=False)
                elbow = model.fit(sm.codebook.matrix).elbow_value_
                if elbow and verbose:
                    print(f'elbow value = {elbow}')
                if not elbow:
                    if verbose:
                        print('elbow not found')
                    # fall back to the k with the best silhouette score
                    ms = -1
                    for k in range(2, 20):
                        km_labels = KMeans(k).fit_predict(sm.codebook.matrix)
                        s = silhouette_score(sm.codebook.matrix, km_labels)
                        if s > ms:
                            ms = s
                            elbow = k
            else:
                elbow = n_clusters
            x = sm.project_data(X)
            labels, _, _ = sm.cluster(opt=elbow, cl_type=cluster_method)
            clabels = []
            for i in range(X.shape[0]):
                clabels.append(labels[x[i]])
            s_score = silhouette_score(X, clabels)
            if verbose:
                print(f'silhouette score = {round(s_score, 3)}')
            max_s = max(s_score, max_s)
            if max_s == s_score:
                opt_k = elbow
                opt_labels = clabels
                opt_size = mapsize
            if max_s > s_score:
                break
        if verbose:
            print(f'optimum mapsize = {opt_size}')
            print(f'optimum number of clusters = {opt_k} & silhouette score = {round(max_s, 3)}')
            print(f'time taken = {round(time.time() - t, 1)}')
        return opt_labels, opt_k
    elif dim_red_method:
        if dim_red_method == 'umap':
            print('UMAP is being used for dimensionality reduction...')
            embedding = umap.UMAP(n_components=2,
                                  n_neighbors=5,
                                  min_dist=0.0001,
                                  metric='euclidean',
                                  random_state=1,
                                  spread=0.5,
                                  n_epochs=1000).fit_transform(X)
            print('UMAP embedding done...')
        elif dim_red_method == 'tsne':
            print('t-SNE is being used for dimensionality reduction...')
            embedding = TSNE(perplexity=perplexity).fit_transform(X)
            print('t-SNE embedding is done...')
        if use_elbow_method:
            model = KElbowVisualizer(KMeans(), k=20, timings=False)
            elbow = model.fit(embedding).elbow_value_
        else:
            elbow = n_clusters
        if cluster_method == 'kmeans':
            opt_labels = KMeans(elbow).fit_predict(embedding)
        elif cluster_method == 'hierarchical':
            opt_labels = AgglomerativeClustering(elbow).fit_predict(embedding)
        if verbose:
            s_score = silhouette_score(X, opt_labels)
            print(f'number of clusters = {elbow} and silhouette_score = {s_score}')
        return opt_labels, elbow
    else:
        if use_elbow_method:
            model = KElbowVisualizer(KMeans(), k=20, timings=False)
            elbow = model.fit(X).elbow_value_
        else:
            elbow = n_clusters
        if cluster_method == 'kmeans':
            opt_labels = KMeans(elbow).fit_predict(X)
        elif cluster_method == 'hierarchical':
            opt_labels = AgglomerativeClustering(elbow).fit_predict(X)
        print(f'silhouette score = {round(silhouette_score(X, opt_labels), 3)}')
        return opt_labels, elbow
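# A hedged usage sketch (added; not from the original source): cluster one
# category of columns via the SOM pathway above. Note this trains a 30x30 map,
# so it is slow on large data; the column names are illustrative only.
import numpy as np
import pandas as pd

cat_df = pd.DataFrame(np.random.rand(300, 6),
                      columns=[f'feat_{i}' for i in range(6)])
labels, k = cluster_category_data(cat_df, dim_red_method='som', verbose=1)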
# -*- coding: utf-8 -*-
"""
Created on Sat Oct 7 15:09:18 2017

@author: Ethan
"""
import numpy as np
from matplotlib import pyplot as plt
from sompy.sompy import SOMFactory

data = np.random.randint(0, 255, (100, 3))
dims = np.array([5, 5])
iterations = 2000
learningRate = 0.01

# normalize
data = data / data.max()

sm = SOMFactory().build(data,
                        normalization='var',
                        initialization='random',
                        component_names=['r', 'g', 'b'])
sm.train(n_job=1, verbose=False, train_rough_len=2, train_finetune_len=5)

topographic_error = sm.calculate_topographic_error()
quantization_error = np.mean(sm._bmu[1])
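# A hedged follow-up sketch (added; not from the original source): show the
# component planes of the RGB map trained above, mirroring the View2D calls
# used in the other examples.
from sompy.visualization.mapview import View2D

view2D = View2D(10, 10, "rgb data", text_size=10)
view2D.show(sm, col_sz=3, which_dim="all", desnormalize=True)
plt.show()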
# Centroids from the hierarchical clustering with non-standardized values
scaler = StandardScaler()
to_revert = final_df_hc.groupby(['Cluster'])[['Years_Education',
                                              'Salary_Invested', 'CMV']].mean()
# final_result = pd.DataFrame(scaler.inverse_transform(X=to_revert),
#                             columns=['Years_Education', 'Salary_Invested', 'CMV'])

# 14. SOM
# 14.1. SOM FOR CONSUMPTION
# Define SOM grid size
mapsize_consump = 9

# Create the algorithm, define parameters and train the grid
sm_consump = SOMFactory().build(data=std_cons.values,
                                mapsize=(mapsize_consump, mapsize_consump),
                                normalization='var',
                                initialization='random',
                                component_names=Consumption.columns,
                                lattice='rect',
                                training='batch')
sm_consump.train(n_job=6, verbose='info',
                 train_rough_len=35, train_finetune_len=80)

# 'final_clusters_consump' is a dataframe similar to df but with an extra
# 'Labels' column indicating the closest neuron to each observation
final_clusters_consump = pd.DataFrame(sm_consump._data,
                                      columns=Consumption.columns).set_index(Consumption.index)
my_labels_c = pd.DataFrame(sm_consump._bmu[0],
                           columns=['Labels']).set_index(Consumption.index)
final_clusters_consump = pd.concat([final_clusters_consump, my_labels_c], axis=1)

# Plot the number of observations associated with each neuron
vhts_c = BmuHitsView(12, 12, "Hits Map", text_size=7)
cc_kmeans = pd.DataFrame(
    scaler.inverse_transform(X=kmeans_cust.cluster_centers_),
    columns=customer_related_num)
sizes = df["kmc_cluster"].value_counts() / len(df["kmc_cluster"])
ap_kmeans = {"cc": cc_kmeans, "sil_score": silhouette_avg, "sizes": sizes}

### 2. Approach: SOM followed by K-Means
scaler = StandardScaler()
cust_norm = scaler.fit_transform(df[customer_related_num])
df_cust_norm = pd.DataFrame(cust_norm, columns=customer_related_num)
X = df_cust_norm.values

sm = SOMFactory().build(data=X,
                        mapsize=(8, 8),
                        normalization='var',
                        initialization="pca",
                        component_names=customer_related_num,
                        lattice="hexa",
                        training="batch")
sm.train(n_job=5, verbose='info', train_rough_len=40, train_finetune_len=100)

final_clusters = pd.DataFrame(sm._data, columns=customer_related_num)
my_labels = pd.DataFrame(sm._bmu[0])
final_clusters = pd.concat([final_clusters, my_labels], axis=1)
cluster_cols = customer_related_num + ["Labels"]
final_clusters.columns = cluster_cols

som_cluster = final_clusters.groupby("Labels").mean()
# create_elbowgraph(10, som_cluster)

kmeans = KMeans(n_clusters=3, random_state=1).fit(som_cluster)
som_cluster["somk_cluster"] = kmeans.labels_
k_cluster = som_cluster.groupby("somk_cluster").mean()
k_cluster = pd.DataFrame(scaler.inverse_transform(X=k_cluster),
                         columns=customer_related_num)
Fv *= 1.e-17
mags = get_appmags(vs.value[::-1], Fv.value[::-1], filters, printit=False)

if meta['AGE'][i] < 0.3:
    ## Z.append([mags[i] for i in gunn + isubaru])
    Z.append([mags[i] for i in gunn])

##
Z = np.array(Z)
print(Z)
print('\n\n')

sm = SOMFactory().build(Z,
                        normalization='var',
                        initialization='random',
                        component_names=gunn)
sm.train(n_job=1, verbose=False, train_rough_len=2, train_finetune_len=5)

topographic_error = sm.calculate_topographic_error()
quantization_error = np.mean(sm._bmu[1])
print("Topographic error = %s; Quantization error = %s" %
      (topographic_error, quantization_error))

vhts = BmuHitsView(10, 10, 'Hits Map', text_size=7)
vhts.show(sm, anotate=True, onlyzeros=False, labelsize=12, cmap='Greys',
          logaritmic=False)
df = df.drop(columns=colunas_apagar)
df = df.dropna()
for c in df:
    if df[c].dtype == 'object':
        encoder = OrdinalEncoder()
        try:
            df[c] = encoder.fit_transform(df[c].values.reshape(-1, 1))
        except TypeError:
            print('apagar ', c)
            df = df.drop(columns=c)

sm = SOMFactory().build(df.values, [50, 50],
                        mask=None,
                        mapshape='planar',
                        lattice='rect',
                        normalization='var',
                        initialization='pca',
                        component_names=list(df.columns))
sm.train(n_job=2, verbose='info', train_rough_len=30, train_finetune_len=20)

with open('/content/drive/My Drive/IC_Cristine/SOM/som_primeiro.pkl', 'wb') as arq:
    pickle.dump(sm, arq)

view2D = sompy.mapview.View2D(100, 100, "rand data", text_size=14)
view2D.show(sm, col_sz=5, which_dim="all", denormalize=True)

topographic_error = sm.calculate_topographic_error()
def SOM_clustering(data, grid_size_list, scale='minmax',
                   plot_grid=True, n_clusters=None):
    X = scale_data(data, scale)
    terror = {}
    sm = {}
    for mapsize in grid_size_list:
        print(f'grid size = {mapsize}')
        sm[str(mapsize)] = SOMFactory().build(X,
                                              normalization='var',
                                              initialization='pca',
                                              mapsize=mapsize)
        sm[str(mapsize)].train(n_job=1, verbose=False,
                               train_rough_len=4, train_finetune_len=10)
        quant_error = np.array(sm[str(mapsize)]._quant_error)[-1:, 2][0]
        topo_error = sm[str(mapsize)].calculate_topographic_error()
        terror[str(mapsize)] = topo_error
        print(f'quantization error = {quant_error}')
        print(f'topographical error = {topo_error}')

    # pick the grid size with the lowest topographic error
    min_terror = 1
    for mapsize in grid_size_list:
        min_terror = min(min_terror, terror[str(mapsize)])
        if min_terror == terror[str(mapsize)]:
            opt_mapsize = mapsize
    sm = sm[str(opt_mapsize)]
    x = sm.project_data(X)

    if n_clusters:
        print(f'number of clusters = {n_clusters}')
        labels, _, _ = sm.cluster(opt=n_clusters, cl_type='kmeans')
        cluster_labels = []
        for i in range(X.shape[0]):
            cluster_labels.append(labels[x[i]])
        s_score = silhouette_score(X, cluster_labels)
        print(f'silhouette score = {s_score}')
        if plot_grid:
            # note: the reshape assumes a 60x60 grid was selected
            image_data = labels.reshape(60, 60)
            plt.figure(figsize=(25, 15))
            plt.imshow(image_data, cmap='viridis')
            plt.grid()
            plt.savefig('SOM_cluster_map.png', dpi=200)
    else:
        max_s = -1
        labels = {}
        cluster_labels = {}
        for clust in [100, 150, 200, 250, 300]:
            print(f'number of clusters = {clust}')
            labels[clust], _, _ = sm.cluster(opt=clust, cl_type='kmeans')
            cluster_labels[clust] = []
            for i in tqdm(range(X.shape[0])):
                cluster_labels[clust].append(labels[clust][x[i]])
            s_score = silhouette_score(X, cluster_labels[clust])
            print(f'silhouette score = {s_score}')
            max_s = max(max_s, s_score)
            if max_s == s_score:
                opt_clust = clust
        print(f'optimum number of clusters = {opt_clust}')
        if plot_grid:
            # note: the reshape assumes a 60x60 grid was selected
            image_data = labels[opt_clust].reshape(60, 60)
            plt.figure(figsize=(25, 15))
            plt.imshow(image_data, cmap='viridis')
            plt.grid()
            plt.savefig('SOM_cluster_map.png', dpi=200)
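# A hedged usage sketch (added; not from the original source). scale_data() is
# the caller-supplied helper that SOM_clustering relies on; the grid sizes and
# cluster count below are illustrative only.
import numpy as np

data = np.random.rand(1000, 8)
SOM_clustering(data, grid_size_list=[(20, 20), (30, 30)],
               scale='minmax', plot_grid=False, n_clusters=10)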
# shuffle the data
training_split_0 = shuffle(training_split_0)
validation_split_0 = shuffle(validation_split_0)

# reset the index after the shuffle
training_split_0.reset_index(inplace=True, drop=True)
validation_split_0.reset_index(inplace=True, drop=True)
print(training_split_0.head())

# Train the data
mapSize = [20, 20]
sm = SOMFactory().build(training_split_0.values,
                        mapSize,
                        normalization="var",
                        lattice="rect",
                        initialization="random",
                        component_names=training_split_0.columns)
sm.train(n_job=1, verbose=None, train_rough_len=2, train_finetune_len=100)

# Much of what follows mirrors the example code provided
# plot the results: component planes
from sompy.visualization.mapview import View2D
view2D = View2D(20, 20, "", text_size=12)
view2D.show(sm, col_sz=3, which_dim="all", denormalize=False)

# Hit maps
from sompy.visualization.bmuhits import BmuHitsView
input_data = pd.read_csv("data/DatasetNuovoConAAeBFiltrato.csv", delimiter=";")
trunc_data = input_data.drop(["Ticker", "Issuer", "Rating"], axis=1)

# Column-wise normalization of the dataset
#trunc_data = (trunc_data - trunc_data.min()) / (trunc_data.max() - trunc_data.min())

# Get the list of headers, save it in names, and drop them from the dataset file
data = trunc_data
names = list(trunc_data.columns.values)
print("FEATURES: ", ", ".join(names))
data = data.values

# Train the SOM
#msz = calculate_msz(data)
sm = SOMFactory().build(data,
                        normalization='var',
                        initialization='random',
                        component_names=names)
sm.train(n_job=1, verbose='info', train_rough_len=2, train_finetune_len=300)

# Compute the topographic and quantization errors
topographic_error = sm.calculate_topographic_error()
quantization_error = np.mean(sm._bmu[1])
print("Topographic error = %s; Quantization error = %s" %
      (topographic_error, quantization_error))

# Display the component planes
from sompy.visualization.mapview import View2D
view2D = View2D(10, 10, "rand data", text_size=10)
view2D.show(sm, col_sz=4, which_dim="all", desnormalize=True)

# Display the BMU hits view
import numpy as np
from sompy.sompy import SOMFactory
from sklearn import datasets

# import the iris dataset
iris = datasets.load_iris()
data = iris.data
labels = iris.target

# initialize the SOM
sm = SOMFactory().build(data, normalization='var', initialization='pca')
sm.train(n_job=1, verbose=True, train_rough_len=2, train_finetune_len=5)

# The quantization error: average distance between each data vector and its BMU.
# The topographic error: the proportion of all data vectors for which the
# first and second BMUs are not adjacent units.
topographic_error = sm.calculate_topographic_error()
quantization_error = np.mean(sm._bmu[1])
print("Topographic error = %s; Quantization error = %s" %
      (topographic_error, quantization_error))

# component planes view
from sompy.visualization.mapview import View2D
view2D = View2D(10, 10, "rand data", text_size=12)
view2D.show(sm, col_sz=4, which_dim="all", desnormalize=True)

# U-matrix plot
from sompy.visualization.umatrix import UMatrixView
umat = UMatrixView(width=10, height=10, title='U-matrix')
umat.show(sm)
                        alpha=0.9)

##################################################
# SOMPY clustering
# practically Jon's code
s_var = 'SOM'
from sompy.sompy import SOMFactory

data_std.sample(2)
data = data_std.drop([k_var, d_var], axis=1).to_numpy()  # .as_matrix() was removed in newer pandas
# exclude the dropped columns so the names align with the data
names = data_std.drop([k_var, d_var], axis=1).columns.values
print(data[1:2, ])

sm = SOMFactory().build(data,
                        mapsize=(10, 15),
                        normalization='var',
                        initialization='random',
                        component_names=names)
sm.train(n_job=4, verbose=False, train_rough_len=2, train_finetune_len=5)

topographic_error = sm.calculate_topographic_error()
quantization_error = np.mean(sm._bmu[1])
print("Topographic error = %s; Quantization error = %s" %
      (topographic_error, quantization_error))

from sompy.visualization.mapview import View2D
view2D = View2D(10, 10, "rand data", text_size=10)
view2D.show(sm, col_sz=4, which_dim="all", desnormalize=True, cmap='plasma')

k_val = 4
class MySOM:
    def __init__(self, df, mapsize, initialization='random'):
        """
        :param df: a DataFrame
        :param mapsize: dimensions of the output layer, usually 2-D, given in the form (20, 20)
        :param initialization: "PCA" or "random", the weight-initialization method
            - PCA initializes the weights with the principal component values of the
              variables; see sompy.codebook.pca_linear_initialization
            - random initializes the weights with random numbers
        """
        self.data = np.array(df)
        self.sm = SOMFactory().build(self.data,
                                     mapsize=mapsize,
                                     initialization=initialization,
                                     component_names=df.columns)
        self.train()

    def train(self):
        self.sm.train(n_job=1, verbose=False,
                      train_rough_len=2, train_finetune_len=5)

    def print_error(self):
        topographic_error = self.sm.calculate_topographic_error()
        quantization_error = np.mean(self.sm._bmu[1])
        print("Topographic error = %s; Quantization error = %s" %
              (topographic_error, quantization_error))

    def draw_input_weights(self):
        from sompy.visualization.mapview import View2D
        view2D = View2D(10, 10, "rand data", text_size=10)
        view2D.show(self.sm, col_sz=4, which_dim="all", desnormalize=True)
        plt.show()

    def draw_hit_map(self):
        from sompy.visualization.bmuhits import BmuHitsView
        vhts = BmuHitsView(4, 4, "Hits Map", text_size=12)
        vhts.show(self.sm, anotate=True, onlyzeros=False, labelsize=12,
                  cmap="Greys", logaritmic=False)
        plt.show()

    def draw_cluster_map(self):
        from sompy.visualization.hitmap import HitMapView
        hits = HitMapView(20, 20, "Clustering", text_size=12)
        hits.show(self.sm)
        plt.show()

    def cluster(self, n):
        self.sm.cluster(n)

    def get_cluster_label(self):
        # length equals mapsize[0] * mapsize[1]
        return self.sm.cluster_labels

    def get_neurons(self):
        """
        Get the neuron (BMU) assigned to each sample of the original data; the
        package does not provide this method, so it is done by hand.
        :return: array, length = self.df.shape[0]
        """
        return self.sm._bmu[0]

    def get_label(self):
        """
        Get the cluster label assigned to each sample of the original data; the
        package does not provide this method, so it is done by hand.
        :return: array, length = self.df.shape[0]
        """
        neurons_label_dict = {
            i: j for i, j in enumerate(self.sm.cluster_labels)
        }
        return np.array([neurons_label_dict[i] for i in self.sm._bmu[0]])

    def predict(self, x):
        """
        Use the cluster labels as y and apply any machine-learning algorithm.
        :param x:
        :return:
        """
        pass
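# A hedged usage sketch (added; not from the original source): wrap a toy
# DataFrame in MySOM, inspect the errors, and cluster the map.
import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.rand(150, 4), columns=['a', 'b', 'c', 'd'])
som = MySOM(df, mapsize=(10, 10))
som.print_error()
som.cluster(4)
print(som.get_cluster_label()[:10])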
fout_train = tfpinit.fout_train

# ---- read data from csv via pandas
fluid_data_df = pd.read_csv(fin, index_col='Name', usecols=input_tfprop)  # index_col='Name', by xinyuewang
fluid_data_df = fluid_data_df[input_tfprop[1:]]  # reorder the columns
fluid_name_df = pd.DataFrame(fluid_data_df.index)
columns = np.array(tfpinit.name_tfprop) if tfpinit.name_tfprop \
    else np.array(fluid_data_df.columns)
# names = np.array(fluid_name_df)
#descr = fluid_data_df.as_matrix()
descr = fluid_data_df.to_numpy()

# make SOM instance
sm = SOMFactory.build(descr,
                      mapsize=mapsize,
                      normalization='var',
                      initialization='pca',
                      component_names=columns)

if __name__ == "__main__":
    # execute SOM training
    sm.train(n_job=n_job, verbose='debug',
             train_rough_len=0, train_finetune_len=0)

    topographic_error = sm.calculate_topographic_error()
    quantization_error = np.mean(sm._bmu[1])
    print("Topographic error = {}; Quantization error = {};"
          .format(topographic_error, quantization_error))

    # output sm.codebook.matrix as HDF5 format
    if isOutTrain:
        print("Saving SOM trained data to {}...".format(fout_train))
        out_df = pd.DataFrame(sm.codebook.matrix, columns=input_tfprop[1:])
# get the columns Lat, Long, Mean Temp, Max Temp, Min Temp, Precipitation
data = concatenated_df[['Lat', 'Long', 'Tm', 'Tx', 'Tn', 'P']]
data = data.apply(pd.to_numeric, errors='coerce')
data = data.dropna(how='any')
names = [
    'Latitude', 'Longitude', 'Monthly Median temperature (C)',
    'Monthly Max temperature (C)', 'Monthly Min temperature (C)',
    'Monthly total precipitation (mm)'
]
print(data.head())

# create the SOM network and train it. You can experiment with different
# normalizations and initializations.
sm = SOMFactory().build(data.values,
                        normalization='var',
                        initialization='pca',
                        component_names=names)
sm.train(n_job=1, verbose=False, train_rough_len=2, train_finetune_len=5)

# The quantization error: average distance between each data vector and its BMU.
# The topographic error: the proportion of all data vectors for which the
# first and second BMUs are not adjacent units.
topographic_error = sm.calculate_topographic_error()
quantization_error = np.mean(sm._bmu[1])
print("Topographic error = %s; Quantization error = %s" %
      (topographic_error, quantization_error))

# component planes view
from sompy.visualization.mapview import View2D
view2D = View2D(10, 10, "rand data", text_size=12)
view2D.show(sm, col_sz=4, which_dim="all", desnormalize=True)
# Reset the index so it goes in order. Since only April was selected, a standard
# index is all messed up: April 2008 to April 2009 skips a ton of index values,
# which makes it impossible to combine the cluster output index with df.
df.index
df = df.reset_index(level=0)
del df['index']

#names = ['zP', 'zVPD', 'pr', 'vpd', 'Latitude', 'Longitude', 'cum_daysinarow_lowpr', 'tmmx', 'tmmn', 'daysabove28', 'NDVI', 'zNDVI']
#names = ['pr', 'vpd', 'cum_daysinarow_lowpr', 'tmmx', 'NDVI', 'Latitude', 'Longitude']

## Investigating
df.info()

som = None
# create the SOM network and train it. You can experiment with different
# normalizations and initializations.
som = SOMFactory().build(df.values,
                         mapsize=[50, 50],
                         normalization='var',
                         initialization='pca',
                         component_names=names)
som.train(n_job=1, verbose=False, train_rough_len=2, train_finetune_len=5)

# The quantization error: average distance between each data vector and its BMU.
# The topographic error: the proportion of all data vectors for which the
# first and second BMUs are not adjacent units.
topographic_error = som.calculate_topographic_error()
quantization_error = np.mean(som._bmu[1])
print("Topographic error = %s; Quantization error = %s" %
      (topographic_error, quantization_error))

from sompy.visualization.mapview import View2D
view2D = View2D(4, 4, "rand data", text_size=16)
view2D.show(som, col_sz=2, which_dim="all", desnormalize=True)

# U-matrix plot
from sompy.visualization.umatrix import UMatrixView
# ... a ton of index values, and this makes it impossible to combine the
# cluster output index with df
df.index
df = df.reset_index(level=0)
del df['index']

## Investigating
# rule of thumb for the number of nodes: 5 * sqrt(rows * columns)
df.info()
dfdrop = df.drop_duplicates()
dfdrop.info()

# scan grid sizes and record both errors for each
topo = []
quant = []
array = np.arange(10, 25, 1)
for i in array:
    som = SOMFactory().build(df.values,
                             mapsize=[i, i],
                             normalization='var',
                             initialization='pca',
                             component_names=names,
                             neighborhood='gaussian',
                             lattice='rect')
    som.train(n_job=1, verbose='info',
              train_rough_len=50, train_rough_radiusin=4,
              train_finetune_radiusin=1, train_finetune_len=50)
    topo.append(som.calculate_topographic_error())
    quant.append(np.mean(som._bmu[1]))
    print(i)

plt.scatter(topo, quant, c=array, s=50)
plt.title('Self Organizing Map')
plt.xlabel('Topographic Error')
plt.ylabel('Quantization Error')
plt.colorbar(label='grid size nxn')