Example #1
import numpy as np
import sklearn.cluster
from sklearn.decomposition import PCA

from sompy import SOM
import getCCIM  # local helper module providing getCCIM()


def MySOM05(img1, ImageType, numClust):

    # normalize image data to [0, 1]
    img = img1.astype(float) / 255

    # image dimensions and derived sample count
    num_features = img.shape[2]
    num_samples = img.shape[0] * img.shape[1]
    img_height = img.shape[0]
    img_width = img.shape[1]

    # flatten the image to (num_samples, num_features)
    img_vector = np.reshape(img, (num_samples, num_features))

    # for hyperspectral input, reduce to 3 dimensions with PCA first
    if ImageType == 'Hyper':
        pca = PCA(3)
        principalComponents = pca.fit_transform(img_vector)
        img_vector = principalComponents
    # SOM map size is 25x25
    N = 25
    som = SOM((N, N), img_vector)
    som.set_parameter(neighbor=0.1, learning_rate=0.2)
    output_map = som.train(100000)

    # flatten the trained map into (N*N, 3) codebook vectors
    somMap = output_map.reshape([output_map.shape[0] * output_map.shape[1], 3])

    # k-means over the SOM codebook to find new centroids
    kmeans2 = sklearn.cluster.KMeans(n_clusters=numClust,
                                     init="k-means++",
                                     max_iter=10).fit(somMap)
    new_centroids = kmeans2.cluster_centers_

    # seed a final k-means run on the full image with the SOM-derived centroids
    kmeans3 = sklearn.cluster.KMeans(n_clusters=numClust,
                                     init=new_centroids,
                                     n_init=1,
                                     max_iter=300).fit(img_vector)

    ClusterIm = np.reshape(kmeans3.labels_, (img_height, img_width))
    ccImOneBase = getCCIM.getCCIM(ClusterIm, 4)
    return ClusterIm, ccImOneBase
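
A minimal usage sketch for MySOM05, assuming a synthetic RGB test image and that the getCCIM module is importable; the image size, 'RGB' tag, and cluster count are illustrative values, not from the original:

# hypothetical driver: 64x64 RGB image, 5 clusters
test_img = (np.random.rand(64, 64, 3) * 255).astype(np.uint8)
ClusterIm, ccImOneBase = MySOM05(test_img, 'RGB', numClust=5)
print(ClusterIm.shape)  # (64, 64) array of cluster labels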
Example #2
import os

import numpy as np
from tqdm import tqdm

import utils.io  # local I/O helpers (load_image_list, load_categories)
from sompy import SOM


def main(args):
    base = os.path.splitext(args.vectors)[0]
    map_path = base + '_som_map.npy'
    winidx_path = base + '_som_winidx.tsv'
    hist_path = base + '_som_hist.tsv'
    mode_path = base + '_som_mode.tsv'
    nearest_path = base + '_som_nearest.tsv'

    print('loading val...')
    val = utils.io.load_image_list(args.val)
    categories = utils.io.load_categories(args.categories)

    v = np.load(args.vectors)
    N = v.shape[0]
    d = v.shape[1]
    C = len(categories)

    def train(som, n):
        # manual training loop so progress can be reported with tqdm:
        # sample an input, find its BMU, update the map
        for i in tqdm(range(n)):
            r = np.random.randint(0, som.input_num)
            data = som.input_layer[r]
            win_idx = som._get_winner_node(data)
            som._update(win_idx, data, i)
        return som.output_layer.reshape(
            (som.shape[1], som.shape[0], som.input_dim))

    output_shape = None
    if not args.resume:
        print('training...')
        output_shape = (args.mapsize[0], args.mapsize[1])
        som = SOM(output_shape, v)
        som.set_parameter(neighbor=0.1, learning_rate=0.2)

        # default: one pass over the data; --epoch or --iteration override
        iteration = N
        if args.epoch != 0:
            iteration = N * args.epoch
        elif args.iteration != 0:
            iteration = args.iteration

        output_map = train(som, iteration)
        np.save(map_path, output_map)
    else:
        output_map = np.load(args.resume)
        print(output_map.shape)
        output_shape = (output_map.shape[0], output_map.shape[1])
        som = SOM(output_shape, v)
        som.set_parameter(neighbor=0.1, learning_rate=0.2)
        som.output_layer = output_map.reshape(
            (output_shape[0] * output_shape[1], -1))

    print('testing...')
    hist_output_map = np.zeros((output_shape[0], output_shape[1], C),
                               dtype=np.int32)
    with open(winidx_path, 'w') as f:
        for pair, vv in tqdm(zip(val, v), total=N):
            idx = som._get_winner_node(vv)
            hist_output_map[idx[0], idx[1], pair[1]] += 1
            f.write(str(idx[0]) + '\t' + str(idx[1]) + '\n')
    np.savetxt(hist_path,
               hist_output_map.reshape((output_shape[0], -1)),
               delimiter='\t',
               fmt='%d')
    mode_category = hist_output_map.argmax(axis=2)
    print(mode_category)
    np.savetxt(mode_path, mode_category, delimiter='\t', fmt='%d')

    def get_nearest(som, input_data):
        # for every map node, find the index of its nearest input vector
        nearest_idx = np.zeros(som.output_layer.shape[0])
        for i, data in enumerate(tqdm(som.output_layer)):
            dis = np.linalg.norm(input_data - data, axis=1)
            nearest_idx[i] = np.argmin(dis)
        return nearest_idx.reshape(som.shape[1], som.shape[0])

    nearest_idx = get_nearest(som, v)
    np.savetxt(nearest_path, nearest_idx, delimiter='\t', fmt='%d')
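
A sketch of the command-line wiring main(args) appears to expect, inferred from the attributes it reads (args.vectors, args.val, args.categories, args.mapsize, args.epoch, args.iteration, args.resume); the defaults and help strings below are assumptions:

if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('vectors', help='path to the .npy feature-vector file')
    parser.add_argument('val', help='validation image list')
    parser.add_argument('categories', help='category list file')
    parser.add_argument('--mapsize', type=int, nargs=2, default=[40, 40])
    parser.add_argument('--epoch', type=int, default=0)
    parser.add_argument('--iteration', type=int, default=0)
    parser.add_argument('--resume', default=None, help='resume from a saved map .npy')
    main(parser.parse_args())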
Example #3
import numpy as np
from matplotlib import pyplot as plt
from sompy import SOM

# 1000 random 3-dimensional (RGB) samples in [0, 1]
input_data = np.random.rand(1000, 3)

# 40x40 output map
output_shape = (40, 40)
som = SOM(output_shape, input_data)
som.set_parameter(neighbor=0.26, learning_rate=0.22)

output_map = som.train(10000)

plt.imshow(output_map, interpolation='none')
plt.show()
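
Once trained, individual samples can be mapped to their best-matching unit with _get_winner_node, as Example #2 does; this follow-up assumes the som object from above:

# locate the best-matching unit (BMU) for a new sample
sample = np.random.rand(3)
bmu = som._get_winner_node(sample)
print('BMU grid coordinates:', bmu)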
Example #4
import math
import os
from collections import Counter

from numpy import array

# Note: the SOM class used here (constructor SOM(x, y, input_dim, learning_rate),
# methods train() and best_match()) and the default_origin constant are assumed
# to be defined elsewhere in the same module.


class WeightedMatrix:
    """TF-IDF weighted term-document matrix with SOM-based clustering."""
    def __init__(self):
        self.number_of_documents = 0
        self.number_distinct_words = 0
        self.number_total_words = 0
        self.number_of_words_in_document = []  # word count per document
        self.document_list = []  # document names
        self.word_total_frequency = []  # total count per distinct word
        self.word_list = []  # distinct words
        self.document_frequency = []  # documents containing each word
        self.TF_IDF_matrix = []  # floats, documents x distinct words
        self.word_frequency_matrix = []  # ints, documents x distinct words
        self.question_frequency_vector = []  # floats, per distinct word
        self.question_TF_IDF_vector = []  # floats, per distinct word
        self.SOM_cluster = None  # SOM instance
        self.document_coordinates = []  # (x, y) BMU per document
        self.document_coordinates_counter = []
        self.question_coordinates = None
        self.min_distance_cluster = None
        self.documents_in_cluster_indexes = []  # indexes of recalled documents
        self.most_similar_document_indexes = []

    def __str__(self):
        return ("Weighted Matrix\n"
                "\tnumber_of_documents = " + str(self.number_of_documents) + "\n"
                "\tnumber_distinct_words = " + str(self.number_distinct_words) + "\n"
                "\tnumber_total_words = " + str(self.number_total_words) + "\n"
                "\tnumber_of_words_in_document = " +
                str(self.number_of_words_in_document) + "\n"
                "\tword_total_frequency = " + str(self.word_total_frequency) + "\n"
                "\tquestion_frequency_vector = " +
                str(self.question_frequency_vector))

    def insert_values(self, dir_fuente=default_origin):
        list_files = os.listdir(dir_fuente)
        set_words = set()
        for file_ in list_files:
            if file_ != ".DS_Store":
                self.number_of_documents += 1
                self.document_list.append(file_)
                with open(dir_fuente + file_) as f:
                    word_list = f.read().split()
                print(len(word_list))
                # count the words of the file just read (not self.word_list,
                # which is still empty at this point)
                self.number_of_words_in_document.append(len(word_list))
                self.number_total_words += len(word_list)
                set_words |= set(word_list)
        self.word_list = list(set_words)
        self.number_distinct_words = len(self.word_list)
        self.document_frequency = [0] * self.number_distinct_words
        self.word_total_frequency = [0] * self.number_distinct_words
        for file_ in self.document_list:
            new_row = [0] * self.number_distinct_words
            with open(dir_fuente + file_) as f:
                word_list = f.read().split()
            counter_word = Counter(word_list)
            for word in counter_word:
                idx = self.word_list.index(word)
                new_row[idx] = counter_word[word]
                self.document_frequency[idx] += 1
                self.word_total_frequency[idx] += counter_word[word]
            self.word_frequency_matrix.append(new_row)
        self.insert_TF_IDF_matrix()

    def insert_TF_IDF_matrix(self):
        for x in range(self.number_of_documents):
            self.TF_IDF_matrix.append([0] * self.number_distinct_words)
        for x in range(len(self.TF_IDF_matrix)):
            for y in range(len(self.TF_IDF_matrix[x])):
                if self.word_frequency_matrix[x][y]:
                    # log-scaled TF times IDF: (1 + log tf) * log(N / df)
                    tf = float(self.word_frequency_matrix[x][y])
                    idf = math.log(float(self.number_of_documents) /
                                   float(self.document_frequency[y]))
                    self.TF_IDF_matrix[x][y] = (1 + math.log(tf)) * idf
                else:
                    self.TF_IDF_matrix[x][y] = 0

    def normalize_TF_IDF(self):
        # shift each row to be non-negative, then scale it to sum to 1
        for x in range(len(self.TF_IDF_matrix)):
            min_ = min(self.TF_IDF_matrix[x])
            for y in range(len(self.TF_IDF_matrix[x])):
                self.TF_IDF_matrix[x][y] = self.TF_IDF_matrix[x][y] - min_
            sum_ = sum(self.TF_IDF_matrix[x])
            for y in range(len(self.TF_IDF_matrix[x])):
                self.TF_IDF_matrix[x][y] = self.TF_IDF_matrix[x][y] / sum_
        for x in range(len(self.TF_IDF_matrix)):
            # sanity check: each row should now sum to 1
            print(sum(self.TF_IDF_matrix[x]))

    def remove_words_with_frequency(self, f):
        # drop every word whose document frequency equals f, keeping all
        # parallel lists and matrices consistent
        x = 0
        while x < len(self.document_frequency):
            if self.document_frequency[x] == f:
                for y in range(len(self.word_frequency_matrix)):
                    self.number_total_words -= self.word_frequency_matrix[y][x]
                    self.number_of_words_in_document[y] -= \
                        self.word_frequency_matrix[y][x]
                    del self.TF_IDF_matrix[y][x]
                    del self.word_frequency_matrix[y][x]
                del self.document_frequency[x]
                del self.word_list[x]
                del self.word_total_frequency[x]
                self.number_distinct_words -= 1
            else:
                x += 1

    def insert_question_vector(self, question_lemma_document):
        self.insert_new_document(question_lemma_document, 'question')
        self.question_vector = self.word_frequency_matrix[-1]
        for word_index in range(len(self.word_frequency_matrix[-1])):
            self.question_TF_IDF_vector.append(
                self.TF_IDF_value(-1, word_index))
        # apply the same shift-and-scale normalization as normalize_TF_IDF
        min_ = min(self.question_TF_IDF_vector)
        for x in range(len(self.question_TF_IDF_vector)):
            self.question_TF_IDF_vector[x] -= min_
        sum_ = sum(self.question_TF_IDF_vector)
        for x in range(len(self.question_TF_IDF_vector)):
            self.question_TF_IDF_vector[x] /= sum_

    def TF_IDF_value(self, doc_index, word_index):
        if self.word_frequency_matrix[doc_index][word_index]:
            # log-scaled TF times IDF, as in insert_TF_IDF_matrix
            tf = float(self.word_frequency_matrix[doc_index][word_index])
            idf = math.log(float(self.number_of_documents) /
                           float(self.document_frequency[word_index]))
            print("    TF_IDF greater than 0:   " + str((1 + math.log(tf)) * idf))
            print(self.word_list[word_index])
            return (1 + math.log(tf)) * idf
        else:
            return 0

    def insert_word(self, word, document_name, frequency=1):
        doc_index = self.document_list.index(document_name)

        if word not in self.word_list:
            word_index = len(self.word_list)
            self.word_list.append(word)
            self.number_distinct_words += 1
            self.word_total_frequency.append(frequency)
            self.document_frequency.append(1)
            # grow every existing row with a zero column for the new word
            for row_index in range(len(self.word_frequency_matrix)):
                self.word_frequency_matrix[row_index].append(0)
        else:
            word_index = self.word_list.index(word)
            self.word_total_frequency[word_index] += frequency
            if self.word_frequency_matrix[doc_index][word_index] == 0:
                self.document_frequency[word_index] += 1
        self.number_of_words_in_document[doc_index] += frequency
        self.number_total_words += frequency
        self.word_frequency_matrix[doc_index][word_index] += frequency

    def insert_new_document(self, document, name):
        print("Inserting a new document")
        with open(document) as f:
            words = f.read().split()
        print(words)
        self.number_of_documents += 1
        self.document_list.append(name)
        self.word_frequency_matrix.append([0] * self.number_distinct_words)
        self.number_of_words_in_document.append(0)
        for word in words:
            self.insert_word(word, name)

    def question_cluster_distance(self):
        # find the document BMU closest (in grid space) to the question's BMU
        (q_x, q_y) = self.SOM_cluster.best_match(self.question_TF_IDF_vector)
        min_ = float('inf')
        min_x = -1
        min_y = -1
        for (x, y) in self.document_coordinates:
            distance = math.sqrt((q_x - x)**2 + (q_y - y)**2)
            if distance < min_:
                min_ = distance
                min_x = x
                min_y = y
        print("Distance result:")
        print("\t\t(x, y) = (" + str(min_x) + ", " + str(min_y) + ")")
        self.min_distance_cluster = (min_x, min_y)
        return (min_x, min_y)

    def recall_document_result(self):
        for i in range(len(self.document_coordinates)):
            if self.document_coordinates[i] == self.min_distance_cluster:
                self.documents_in_cluster_indexes.append(i)

    def create_SOM(self, x, y, I, LR=0.05):
        if self.SOM_cluster is None:
            self.SOM_cluster = SOM(x, y, self.number_distinct_words, LR)
        self.SOM_cluster.train(I, self.TF_IDF_matrix)

    def test(self):
        if self.question_TF_IDF_vector == []:
            print("ERROR: the question TF_IDF vector has not been created yet.")
        else:
            # find the BMU of every document, then of the question itself
            for row in range(len(self.TF_IDF_matrix[:-1])):
                (x, y) = self.SOM_cluster.best_match(
                    array(self.TF_IDF_matrix[row]))
                self.document_coordinates.append((x, y))
            self.question_coordinates = self.SOM_cluster.best_match(
                array(self.question_TF_IDF_vector))
        self.document_coordinates_counter = Counter(self.document_coordinates)
        self.question_cluster_distance()
        self.recall_document_result()
        print("Number of documents in the nearest cluster: " + str(
            self.document_coordinates_counter[self.min_distance_cluster]))
        print("Documents contained in the nearest cluster")
        for i in self.documents_in_cluster_indexes:
            print("Document name: " + self.document_list[i])
            print()
            print("TF-IDF weight matrix")
            print(self.TF_IDF_matrix[i])
        print("Percentage these documents represent: " + str(
            float(self.document_coordinates_counter[self.min_distance_cluster])
            / float(self.number_of_documents) * 100.0) + " %")

    def similarity_measure_jacard(self):
        # rank all documents by Jaccard similarity to the question vector
        disordered_list = []
        for j in range(len(self.TF_IDF_matrix[:-1])):
            sum_wq_wj = float(
                sum(a * b for a, b in zip(self.TF_IDF_matrix[j],
                                          self.question_TF_IDF_vector)))
            sum_wq_2 = float(sum(a**2 for a in self.question_TF_IDF_vector))
            sum_wj_2 = float(sum(a**2 for a in self.TF_IDF_matrix[j]))
            jacard_sim_ = sum_wq_wj / (sum_wq_2 + sum_wj_2 - sum_wq_wj)
            disordered_list.append((j, self.document_list[j], jacard_sim_))
        self.most_similar_document_indexes = sorted(
            disordered_list,
            key=lambda similarity: similarity[2],
            reverse=True)

    def similarity_measure_cosine(self):
        # rank all documents by cosine similarity to the question vector
        max_sim_number = -1000
        max_document_index = -1
        disordered_list = []
        for j in range(len(self.TF_IDF_matrix[:-1])):
            sum_wq_wj = float(
                sum(a * b for a, b in zip(self.TF_IDF_matrix[j],
                                          self.question_TF_IDF_vector)))
            sum_wq_2 = float(sum(a**2 for a in self.question_TF_IDF_vector))
            sum_wj_2 = float(sum(a**2 for a in self.TF_IDF_matrix[j]))
            cosine_sim_ = sum_wq_wj / (sum_wq_2 * sum_wj_2)**.5
            disordered_list.append((j, self.document_list[j], cosine_sim_))
            if cosine_sim_ > max_sim_number:
                max_sim_number = cosine_sim_
                max_document_index = j
        print("The document with the highest cosine similarity is: " +
              self.document_list[max_document_index])
        print("With a similarity value of: " + str(max_sim_number))
        self.most_similar_document_indexes = sorted(
            disordered_list,
            key=lambda similarity: similarity[2],
            reverse=True)
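
A hedged end-to-end sketch of how WeightedMatrix appears intended to be driven; the directory, file name, map size, and iteration count are illustrative, and default_origin plus the SOM class must exist in the module:

# hypothetical pipeline: build the matrix, cluster it, then query it
wm = WeightedMatrix()
wm.insert_values('corpus/')                # illustrative document directory
wm.remove_words_with_frequency(1)          # optionally drop words unique to one document
wm.normalize_TF_IDF()
wm.create_SOM(10, 10, I=500)               # 10x10 map, 500 iterations (illustrative)
wm.insert_question_vector('question.txt')  # illustrative query document
wm.test()
wm.similarity_measure_cosine()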
Example #5
# method excerpt from the WeightedMatrix class in Example #4
def create_SOM(self, x, y, I, LR=0.05):
    if self.SOM_cluster is None:
        self.SOM_cluster = SOM(x, y, self.number_distinct_words, LR)
    self.SOM_cluster.train(I, self.TF_IDF_matrix)
Example #6
import numpy as np
from sompy import SOM
import matplotlib.pyplot as plt
import matplotlib.animation as animation

N = 20
colors = [[0, 0, 0], [1, 0, 0], [0, 1, 0], [0, 0, 1], [1, 1, 1]]
som = SOM((N, N), colors)
som.set_parameter(neighbor=0.3)
# create the figure before plotting so every frame's artist belongs to it
fig = plt.figure()
ims = []
for i in range(1000):
    m = som.train(10)  # train in short bursts and snapshot the current map
    im = plt.imshow(m, interpolation='none', animated=True)
    ims.append([im])
ani = animation.ArtistAnimation(fig, ims, interval=100, blit=True,
                                repeat_delay=1000)
plt.show()
# ani.save('dynamic_images.mp4')
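
One hedged note: saving with ani.save('dynamic_images.mp4') generally requires a movie writer such as FFmpeg to be available to matplotlib; ani.save('dynamic_images.gif', writer='pillow') is a common fallback when FFmpeg is not installed.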