示例#1
0
#!/usr/bin/python3.2
# -*-coding:Utf-8 -*

import math
import numpy as np
import random as rnd

from tfidfStorage import TFIDF_reader
from tfidfStorage import tfidf_cosine_distance as cosdist
from cluster import Cluster

# Reader over the TF-IDF vectors named "test.vectors".
# NOTE(review): presumably a path relative to the working directory -- confirm
# against TFIDF_reader, which is defined in tfidfStorage, not in this file.
reader = TFIDF_reader("test.vectors")

# Number of tfidf vectors (taken from the reader's doc_nb attribute)
npoint = reader.doc_nb
# Number of clusters to build
ncluster = 100

# --- Disabled alternative initialisation, kept for reference ----------------
# Goal: initialize centroids so that they are as far from one another as
# possible. The commented-out code starts from point 0, then repeatedly picks
# the point whose distance to its closest already-chosen centroid is largest
# (argmax of mindist), and refreshes mindist with the element-wise minimum of
# the old values and the distances to the newly chosen centroid.
# NOTE(review): it relies on a `tfidf_list` sequence and a `.cosdist` method
# that are not defined in the visible part of this file -- verify before
# re-enabling.
# Centroids indices in tfidf_list
#centroids_idx = [0]
# Distance of each point to the closest cluster
#mindist = np.array([tfidfdct.cosdist(tfidf_list[0]) for tfidfdct in tfidf_list])
#for i in range(1, ncluster) :
#	max_idx = np.argmax(mindist)
#	centroids_idx.append(max_idx)
#	dist = np.array([tfidfdct.cosdist(tfidf_list[max_idx]) for tfidfdct in tfidf_list])
#	mindist = np.amin(np.vstack((mindist, dist)), 0)

# One Cluster per selected centroid (disabled together with the block above).
#clusters = [Cluster(tfidf_list[c_idx]) for c_idx in centroids_idx]
示例#2
0
                self.__class__(points_1, self.compute_distance, self.get_max_subnodes, self.sample_size),
                self.__class__(points_2, self.compute_distance, self.get_max_subnodes, self.sample_size),
            ]


def threshold_cosine(initial, max, max_count):
    """Build a stateful callable that yields a linearly growing threshold.

    Every call to the returned function increments its ``count`` attribute
    and returns ``initial + (max - initial) * count / max_count`` as a float.
    The result therefore equals ``max`` on call number ``max_count`` and keeps
    growing beyond ``max`` on any later call (no clamping is applied).

    Args:
        initial: Starting value of the ramp.
        max: Value reached on call number ``max_count``. NOTE: this parameter
            name shadows the builtin ``max`` inside this function.
        max_count: Number of calls needed to go from ``initial`` to ``max``.

    Returns:
        A zero-argument function whose ``count`` attribute holds the number
        of calls made so far (0 right after creation).
    """

    def next_threshold():
        # The call counter lives on the function object itself so that it
        # persists between calls and stays readable from the outside.
        next_threshold.count = next_threshold.count + 1
        base = float(initial)
        scaled_span = float((max - initial) * next_threshold.count)
        return base + scaled_span / float(max_count)

    next_threshold.count = 0
    return next_threshold


if __name__ == "__main__":
    reader = TFIDF_reader("test.vectors")
    grgpf = GRGPF(
        tfidf_cosine_distance,
        reader.read_docId,
        limit_subnodes=10,
        limit_clusters_per_leaf=10,
        limit_total_clusters=1000,
        limit_total_nodes=200,
        k=10,
        sample_size=30,
        get_next_threshold=threshold_cosine(0.8, 0.99, 10),
    )
    for i in range(0, reader.doc_nb):
        print(i)
        if i != 0 and i % 100 == 0:  # recalcul des samples tout les 100 points
            print("Recomputing samples")