# SOM + k-means image segmentation.
import numpy as np
import sklearn.cluster
from sklearn.decomposition import PCA
from sompy import SOM

import getCCIM  # local module providing getCCIM()


def MySOM05(img1, ImageType, numClust):
    # normalize image data to [0, 1]
    img2 = img1.astype(float)
    img = img2 / 255

    # constants used below
    num_features = img.shape[2]
    num_samples = img.shape[0] * img.shape[1]
    img_height = img.shape[0]
    img_width = img.shape[1]

    # reshape the image into a (num_samples, num_features) matrix
    img_vector = np.reshape(img, (num_samples, num_features))

    # for hyperspectral input, replace img_vector with its first 3 principal components
    if ImageType == 'Hyper':
        pca = PCA(3)
        principalComponents = pca.fit_transform(img_vector)
        img_vector = principalComponents

    # map size is 25 x 25
    N = 25
    som = SOM((N, N), img_vector)
    som.set_parameter(neighbor=0.1, learning_rate=0.2)
    output_map = som.train(100000)

    somMap = output_map.reshape([output_map.shape[0] * output_map.shape[1], 3])

    # k-means on the SOM prototypes to find new centroids
    kmeans2 = sklearn.cluster.KMeans(n_clusters=numClust, init="k-means++",
                                     max_iter=10).fit(somMap)
    uT = kmeans2.labels_
    new_centroids = kmeans2.cluster_centers_

    # use new_centroids to initialize the final k-means over the full image vector
    kmeans3 = sklearn.cluster.KMeans(n_clusters=numClust, init=new_centroids,
                                     n_init=1, max_iter=300).fit(img_vector)
    ClusterIm = np.reshape(kmeans3.labels_, (img_height, img_width))
    ccImOneBase = getCCIM.getCCIM(ClusterIm, 4)
    return ClusterIm, ccImOneBase
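A minimal usage sketch for MySOM05, assuming the imports above are available and getCCIM is on the path; the image file name and cluster count below are placeholders.

import numpy as np
from PIL import Image

rgb = np.array(Image.open('example_rgb.png').convert('RGB'))  # H x W x 3 uint8 image
ClusterIm, ccImOneBase = MySOM05(rgb, 'RGB', numClust=5)
print(ClusterIm.shape)        # (H, W) array of cluster labels
print(np.unique(ClusterIm))   # labels 0 .. numClust-1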
import os

import numpy as np
from tqdm import tqdm

import utils.io
from sompy import SOM


def main(args):
    outputdir = os.path.dirname(args.vectors)
    map_path = os.path.splitext(args.vectors)[0] + '_som_map.npy'
    winidx_path = os.path.splitext(args.vectors)[0] + '_som_winidx.tsv'
    hist_path = os.path.splitext(args.vectors)[0] + '_som_hist.tsv'
    mode_path = os.path.splitext(args.vectors)[0] + '_som_mode.tsv'
    nearest_path = os.path.splitext(args.vectors)[0] + '_som_nearest.tsv'

    print('loading val...')
    val = utils.io.load_image_list(args.val)
    categories = utils.io.load_categories(args.categories)
    v = np.load(args.vectors)
    N = v.shape[0]
    d = v.shape[1]
    C = len(categories)

    def train(som, n):
        # manual training loop so progress can be reported with tqdm
        for i in tqdm(range(n)):
            r = np.random.randint(0, som.input_num)
            data = som.input_layer[r]
            win_idx = som._get_winner_node(data)
            som._update(win_idx, data, i)
        return som.output_layer.reshape(
            (som.shape[1], som.shape[0], som.input_dim))

    output_shape = None
    if not args.resume:
        print('training...')
        output_shape = (args.mapsize[0], args.mapsize[1])
        som = SOM(output_shape, v)
        som.set_parameter(neighbor=0.1, learning_rate=0.2)
        iteration = N
        if args.epoch != 0:
            iteration = N * args.epoch
        elif args.iteration != 0:
            iteration = args.iteration
        output_map = train(som, iteration)
        np.save(map_path, output_map)
    else:
        output_map = np.load(args.resume)
        print(output_map.shape)
        output_shape = (output_map.shape[0], output_map.shape[1])
        som = SOM(output_shape, v)
        som.set_parameter(neighbor=0.1, learning_rate=0.2)
        som.output_layer = output_map.reshape(
            (output_shape[0] * output_shape[1], -1))

    print('testing...')
    hist_output_map = np.zeros((output_shape[0], output_shape[1], C),
                               dtype=np.int32)
    pbar = tqdm(total=N)
    with open(winidx_path, 'w') as f:
        for i, (pair, vv) in enumerate(zip(val, v)):
            idx = som._get_winner_node(vv)
            hist_output_map[idx[0], idx[1], pair[1]] += 1
            f.write(str(idx[0]) + '\t' + str(idx[1]) + '\n')
            pbar.update(1)
    pbar.close()

    np.savetxt(hist_path, hist_output_map.reshape((output_shape[0], -1)),
               delimiter='\t', fmt='%d')
    mode_category = hist_output_map.argmax(axis=2)
    print(mode_category)
    np.savetxt(mode_path, mode_category, delimiter='\t', fmt='%d')

    def get_nearest(som, input_data):
        # for each SOM node, find the index of the closest input vector
        nearest_idx = np.zeros(som.output_layer.shape[0])
        for i, data in enumerate(tqdm(som.output_layer)):
            sub = input_data - data
            dis = np.linalg.norm(sub, axis=1)
            nearest_idx[i] = np.argmin(dis)
        return nearest_idx.reshape(som.shape[1], som.shape[0])

    nearest_idx = get_nearest(som, v)
    np.savetxt(nearest_path, nearest_idx, delimiter='\t', fmt='%d')
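A hypothetical command-line entry point consistent with the attributes main() reads (vectors, val, categories, mapsize, epoch, iteration, resume); the defaults and help strings are assumptions, not taken from the original script.

import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Train and evaluate a SOM over feature vectors')
    parser.add_argument('vectors', help='path to the .npy feature matrix (N x d)')
    parser.add_argument('val', help='validation image list, one "path label" pair per line')
    parser.add_argument('categories', help='file listing the category names')
    parser.add_argument('--mapsize', type=int, nargs=2, default=[40, 40],
                        help='SOM grid size, e.g. --mapsize 40 40 (assumed default)')
    parser.add_argument('--epoch', type=int, default=0,
                        help='number of passes over the N samples (0 = disabled)')
    parser.add_argument('--iteration', type=int, default=0,
                        help='explicit number of update steps (0 = use N)')
    parser.add_argument('--resume', default='',
                        help='path to a previously saved *_som_map.npy to skip training')
    main(parser.parse_args())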
import numpy as np
from matplotlib import pyplot as plt
from sompy import SOM

# 1000 random RGB vectors as training data
input_data = np.random.rand(1000, 3)

output_shape = (40, 40)
som = SOM(output_shape, input_data)
som.set_parameter(neighbor=0.26, learning_rate=0.22)
output_map = som.train(10000)

plt.imshow(output_map, interpolation='none')
plt.show()
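The trained map can also be queried for the best-matching unit of a new sample. This sketch reuses the private _get_winner_node helper that the evaluation script above calls, so it assumes that method exists in this sompy fork; the sample vector is arbitrary.

new_sample = np.array([0.9, 0.1, 0.1])      # a reddish colour
winner = som._get_winner_node(new_sample)   # (row, col) index into the 40 x 40 grid
print('winner node:', winner)
print('its weight vector:', output_map[winner[0], winner[1]])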
import math
import os
from collections import Counter

from numpy import array

# SOM and default_origin are assumed to be provided elsewhere in this project:
# this SOM takes (x, y, input_dim, learning_rate) and exposes best_match()/train().


class WeightedMatrix:
    """Docstring for WeightedMatrix"""

    def __init__(self):
        self.number_of_documents = 0   # integer
        self.number_distinct_words = 0  # integer
        self.number_total_words = 0    # integer
        self.number_of_words_in_document = []  # list of integers (size: number of documents)
        self.document_list = []        # list of strings (size: number of documents)
        self.word_total_frequency = []  # list of integers (size: number of distinct words)
        self.word_list = []            # list of strings (size: number of distinct words)
        self.document_frequency = []   # list of integers (size: number of distinct words)
        self.TF_IDF_matrix = []        # list of lists of floats (number of documents x number of words)
        self.word_frequency_matrix = []  # list of lists of integers (number of documents x number of words)
        self.question_frequency_vector = []  # list of floats (size: number of distinct words)
        self.question_TF_IDF_vector = []     # list of floats (size: number of distinct words)
        self.SOM_cluster = None        # one SOM object
        self.document_coordinates = []  # list of (x, y) pairs
        self.document_coordinates_counter = []
        self.question_coordinates = None
        self.min_distance_cluster = None
        self.documents_in_cluster_indexes = []  # list of integers (size of the recall)
        self.most_similar_document_indexes = []

    def __str__(self):
        return """Weighted Matrix
number_of_documents = """ + str(self.number_of_documents) + """
number_distinct_words = """ + str(self.number_distinct_words) + """
number_total_words = """ + str(self.number_total_words) + """
number_of_words_in_document = """ + str(self.number_of_words_in_document) + """
word_total_frequency = """ + str(self.word_total_frequency) + """
question_frequency_vector = """ + str(self.question_frequency_vector)

    def insert_values(self, dir_fuente=default_origin):
        list_files = os.listdir(dir_fuente)
        set_words = set()
        for file_ in list_files:
            if file_ != ".DS_Store":
                self.number_of_documents += 1
                self.document_list.append(file_)
                f = open(dir_fuente + file_)
                word_list = f.read().split()
                print(len(word_list))
                # count the words of this document (the local word_list just read)
                self.number_of_words_in_document.append(len(word_list))
                self.number_total_words += len(word_list)
                set_words |= set(word_list)
        self.word_list = list(set_words)
        self.number_distinct_words = len(self.word_list)
        self.document_frequency = [0] * self.number_distinct_words
        self.word_total_frequency = [0] * self.number_distinct_words
        for file_ in self.document_list:
            new_row = [0] * self.number_distinct_words
            f = open(dir_fuente + file_)
            word_list = f.read().split()
            counter_word = Counter(word_list)
            for word in counter_word:
                new_row[self.word_list.index(word)] = counter_word[word]
                self.document_frequency[self.word_list.index(word)] += 1
                self.word_total_frequency[self.word_list.index(word)] += counter_word[word]
            self.word_frequency_matrix.append(new_row)
        self.insert_TF_IDF_matrix()

    def insert_TF_IDF_matrix(self):
        for x in range(self.number_of_documents):
            new_row = []
            for y in range(self.number_distinct_words):
                new_row.append(0)
            self.TF_IDF_matrix.append(new_row)
        for x in range(len(self.TF_IDF_matrix)):
            for y in range(len(self.TF_IDF_matrix[x])):
                if self.word_frequency_matrix[x][y]:
                    self.TF_IDF_matrix[x][y] = (
                        1 + math.log(float(self.word_frequency_matrix[x][y]))
                    ) * math.log(float(self.number_of_documents) /
                                 float(self.document_frequency[y]))
                else:
                    self.TF_IDF_matrix[x][y] = 0

    def normalize_TF_IDF(self):
        for x in range(len(self.TF_IDF_matrix)):
            min_ = min(self.TF_IDF_matrix[x])
            for y in range(len(self.TF_IDF_matrix[x])):
                self.TF_IDF_matrix[x][y] = self.TF_IDF_matrix[x][y] - min_
            sum_ = sum(self.TF_IDF_matrix[x])
            for y in range(len(self.TF_IDF_matrix[x])):
                self.TF_IDF_matrix[x][y] = self.TF_IDF_matrix[x][y] / sum_
        for x in range(len(self.TF_IDF_matrix)):
            print(sum(self.TF_IDF_matrix[x]))

    def remove_words_with_frequency(self, f):
        size_ = len(self.document_frequency)
        x = 0
        while x < size_:
            if self.document_frequency[x] == f:
                size_ -= 1
                self.document_frequency = self.document_frequency[:x] + self.document_frequency[x + 1:]
                self.word_list = self.word_list[:x] + self.word_list[x + 1:]
                self.number_distinct_words += -1
                self.word_total_frequency = self.word_total_frequency[:x] + self.word_total_frequency[x + 1:]
                for y in range(len(self.word_frequency_matrix)):
                    self.number_total_words += -1 * self.word_frequency_matrix[y][x]
                    self.number_of_words_in_document[y] += -1 * self.word_frequency_matrix[y][x]
                    self.TF_IDF_matrix[y] = self.TF_IDF_matrix[y][:x] + self.TF_IDF_matrix[y][x + 1:]
                    self.word_frequency_matrix[y] = self.word_frequency_matrix[y][:x] + self.word_frequency_matrix[y][x + 1:]
            else:
                x += 1

    def insert_question_vector(self, question_lemma_document):
        self.insert_new_document(question_lemma_document, 'question')
        self.question_vector = self.word_frequency_matrix[-1]
        for word_index in range(len(self.word_frequency_matrix[-1])):
            self.question_TF_IDF_vector.append(self.TF_IDF_value(-1, word_index))
        min_ = min(self.question_TF_IDF_vector)
        for x in range(len(self.question_TF_IDF_vector)):
            self.question_TF_IDF_vector[x] = self.question_TF_IDF_vector[x] - min_
        sum_ = sum(self.question_TF_IDF_vector)
        for x in range(len(self.question_TF_IDF_vector)):
            self.question_TF_IDF_vector[x] = self.question_TF_IDF_vector[x] / sum_

    def TF_IDF_value(self, doc_index, word_index):
        if self.word_frequency_matrix[doc_index][word_index]:
            print(" TF_IDF greater than 0: " + str(
                (1 + math.log(float(self.word_frequency_matrix[doc_index][word_index]))) *
                math.log(float(self.number_of_documents) /
                         float(self.document_frequency[word_index]))))
            print(self.word_list[word_index])
            return (1 + math.log(float(self.word_frequency_matrix[doc_index][word_index]))) * \
                math.log(float(self.number_of_documents) /
                         float(self.document_frequency[word_index]))
        else:
            return 0

    def insert_word(self, word, document_name, frequency=1):
        doc_index = self.document_list.index(document_name)
        if word not in self.word_list:
            word_index = len(self.word_list)
            self.word_list.append(word)
            self.number_distinct_words += 1
            self.word_total_frequency.append(frequency)
            self.document_frequency.append(1)
            for row_index in range(len(self.word_frequency_matrix)):
                self.word_frequency_matrix[row_index].append(0)
        else:
            word_index = self.word_list.index(word)
            self.word_total_frequency[word_index] += frequency
            if self.word_frequency_matrix[doc_index][word_index] == 0:
                self.document_frequency[word_index] += 1
        self.number_of_words_in_document[doc_index] += frequency
        self.number_total_words += frequency
        self.word_frequency_matrix[doc_index][word_index] += frequency

    def insert_new_document(self, document, name):
        print("Inserting a new document")
        f = open(document)
        words = f.read().split()
        print(words)
        self.number_of_documents += 1
        self.document_list.append(name)
        self.word_frequency_matrix.append([0] * self.number_distinct_words)
        self.number_of_words_in_document.append(0)
        for word in words:
            self.insert_word(word, name)

    def question_cluster_distance(self):
        (q_x, q_y) = self.SOM_cluster.best_match(self.question_TF_IDF_vector)
        min_ = 1000
        min_x = -1
        min_y = -1
        for (x, y) in self.document_coordinates:
            distance = math.sqrt((q_x - x)**2 + (q_y - y)**2)
            if distance < min_:
                min_ = distance
                min_x = x
                min_y = y
        print("Distance result:")
        print("  (x, y) = (" + str(min_x) + ", " + str(min_y) + ")")
        self.min_distance_cluster = (min_x, min_y)
        return (min_x, min_y)

    def recall_document_result(self):
        for i in range(len(self.document_coordinates)):
            if self.document_coordinates[i] == self.min_distance_cluster:
                self.documents_in_cluster_indexes.append(i)

    def create_SOM(self, x, y, I, LR=0.05):
        if self.SOM_cluster is None:
            self.SOM_cluster = SOM(x, y, self.number_distinct_words, LR)
        self.SOM_cluster.train(I, self.TF_IDF_matrix)

    def test(self):
        coor_y = []
        coor_x = []
        clusters = []
        if self.question_TF_IDF_vector == []:
            print("ERROR: the question TF_IDF vector has not been created yet.")
        else:
            for row in range(len(self.TF_IDF_matrix[:-1])):
                (x, y) = self.SOM_cluster.best_match(array(self.TF_IDF_matrix[row]))
                self.document_coordinates.append((x, y))
                coor_y.append(y)
                coor_x.append(x)
            self.question_coordinates = self.SOM_cluster.best_match(
                array(self.question_TF_IDF_vector))
            self.document_coordinates_counter = Counter(self.document_coordinates)
            self.question_cluster_distance()
            self.recall_document_result()
            print("Number of documents in the nearest cluster: " + str(
                self.document_coordinates_counter[self.min_distance_cluster]))
            print("Documents contained in the nearest cluster")
            for i in self.documents_in_cluster_indexes:
                print("Document name: " + self.document_list[i])
                print("")
                print("TF-IDF weight matrix")
                print(self.TF_IDF_matrix[i])
            print("Percentage these documents represent: " + str(
                float(self.document_coordinates_counter[self.min_distance_cluster]) /
                float(self.number_of_documents) * 100.0) + " %")

    def similarity_measure_jacard(self):
        max_sim_number = -1000
        max_document_index = -1
        disordered_list = []
        for j in range(len(self.TF_IDF_matrix[:-1])):
            sum_wq_wj = float(sum([
                a * b for a, b in zip(self.TF_IDF_matrix[j],
                                      self.question_TF_IDF_vector)
            ]))
            sum_wq_2 = float(sum([a**2 for a in self.question_TF_IDF_vector]))
            sum_wj_2 = float(sum([a**2 for a in self.TF_IDF_matrix[j]]))
            jacard_sim_ = sum_wq_wj / (sum_wq_2 + sum_wj_2 - sum_wq_wj)
            disordered_list.append((j, self.document_list[j], jacard_sim_))
            if jacard_sim_ > max_sim_number:
                max_sim_number = jacard_sim_
                max_document_index = j
        self.most_similar_document_indexes = sorted(
            disordered_list, key=lambda similarity: similarity[2], reverse=True)

    def similarity_measure_cosine(self):
        max_sim_number = -1000
        max_document_index = -1
        disordered_list = []
        for j in range(len(self.TF_IDF_matrix[:-1])):
            sum_wq_wj = float(sum([
                a * b for a, b in zip(self.TF_IDF_matrix[j],
                                      self.question_TF_IDF_vector)
            ]))
            sum_wq_2 = float(sum([a**2 for a in self.question_TF_IDF_vector]))
            sum_wj_2 = float(sum([a**2 for a in self.TF_IDF_matrix[j]]))
            cosine_sim_ = sum_wq_wj / (sum_wq_2 * sum_wj_2)**.5
            disordered_list.append((j, self.document_list[j], cosine_sim_))
            if cosine_sim_ > max_sim_number:
                max_sim_number = cosine_sim_
                max_document_index = j
        print("The document with the highest cosine similarity is: " +
              self.document_list[max_document_index])
        print("With a similarity value of: " + str(max_sim_number))
        self.most_similar_document_indexes = sorted(
            disordered_list, key=lambda similarity: similarity[2], reverse=True)
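A minimal driver sketch for WeightedMatrix, using only the methods defined above; the corpus directory, question file, and SOM/training parameters are placeholders rather than values from the original project.

# Hypothetical usage of WeightedMatrix; 'corpus/' and 'question_lemmas.txt' are placeholders.
wm = WeightedMatrix()
wm.insert_values('corpus/')          # build the frequency and TF-IDF matrices from the corpus
wm.remove_words_with_frequency(1)    # optionally drop words that appear in exactly one document
wm.normalize_TF_IDF()
wm.insert_question_vector('question_lemmas.txt')
wm.create_SOM(10, 10, 1000)          # 10 x 10 map, 1000 training iterations, default LR=0.05
wm.test()                            # cluster documents and report the cluster nearest the question
wm.similarity_measure_cosine()       # rank documents against the question by cosine similarity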
import numpy as np
from sompy import SOM
import matplotlib.pyplot as plt
import matplotlib.animation as animation

N = 20
colors = [[0, 0, 0], [1, 0, 0], [0, 1, 0], [0, 0, 1], [1, 1, 1]]

som = SOM((N, N), colors)
som.set_parameter(neighbor=0.3)

# create the figure first so the animation frames are attached to it
fig = plt.figure()

ims = []
for i in range(1000):
    m = som.train(10)  # train in short bursts and record a frame after each burst
    im = plt.imshow(m.tolist(), interpolation='none', animated=True)
    ims.append([im])

ani = animation.ArtistAnimation(fig, ims, interval=100, blit=True,
                                repeat_delay=1000)
plt.show()
# ani.save('dynamic_images.mp4')
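If the trained colour map should be kept for later reuse (for example to reload it the way the evaluation script above does with np.load), it can be saved after the animation loop; the file name below is arbitrary.

final_map = som.train(10)                 # train() returns the current map, as in the snippets above
np.save('colors_som_map.npy', final_map)  # persist the (N, N, 3) map

restored = np.load('colors_som_map.npy')
plt.imshow(restored, interpolation='none')
plt.show()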