示例#1
0
    def fit(self, data, norm=False):
        """Cluster `data` with spherical k-means and select a "main" cluster.

        Args:
            data numpy.ndarray: [m, n] matrix, m samples of dimension n.
            norm boolean: if True, `data` is assumed preprocessed already;
                otherwise it is mean-centered and L2-normalized first.
        Returns:
            (cluster centers, per-sample labels, index of the main cluster)
        """
        if norm:
            process_data = data
        else:
            self.mean = np.mean(data, axis=0)
            process_data = self.l2norm(data - self.mean)

        skm = SphericalKMeans(n_clusters=self.ncluster, verbose=0)
        skm.fit(process_data)
        self.clusters = skm.cluster_centers_
        self.clusterid = skm.labels_
        self.loss = skm.inertia_

        # Mean cosine similarity of each cluster's members to its center.
        scores = []
        for cid in range(self.ncluster):
            member_rows = np.where(self.clusterid == cid)[0].tolist()
            members = process_data[member_rows, :]
            scores.append(np.mean(np.dot(members, self.clusters[cid, :].T)))
        # NOTE(review): argmin selects the *least* cohesive cluster as the
        # main one — confirm this is intended (argmax would pick the tightest).
        self.main_id = np.argmin(scores)
        print(self.main_id)

        return self.clusters, self.clusterid, self.main_id
示例#2
0
def calc_logit_regress_stats(inputs, outputs, plot_name, K_SIZE):
    """Cluster `inputs` into K_SIZE spherical k-means clusters and save a
    stacked bar chart showing how the `outputs` labels distribute over the
    clusters.
    """
    skm = SphericalKMeans(n_clusters=K_SIZE)
    skm.fit(inputs)
    input_labels = skm.labels_

    out_keys = list(set(outputs))
    out_idx_mapping = {out: idx for idx, out in enumerate(out_keys)}
    print(out_idx_mapping)

    # counts[label_idx][cluster_id] = samples with that label in the cluster
    k_center_labels = [[0] * K_SIZE for _ in out_keys]
    for cluster_id, label in zip(input_labels, outputs):
        k_center_labels[out_idx_mapping[label]][cluster_id] += 1

    k_center_labels = np.asarray(k_center_labels)
    ind = np.arange(K_SIZE)
    bottom = np.zeros(K_SIZE)
    plots = []
    # Stack one bar segment per output label on top of the previous ones.
    for row in k_center_labels:
        plots.append(plt.bar(ind, row, bottom=bottom))
        bottom += row

    plt.title('Song genres in spherical k-means clusters')
    plt.xticks(ind, ["K" + str(i + 1) for i in range(K_SIZE)])
    plt.legend(plots, out_keys)
    plt.savefig(plot_name)
def perform_clustering(seed, m_data, labels, n_clusters):
    """Compare singleview vs multiview spherical k-means on two data views.

    Prints NMI against `labels` for each single view, for the concatenated
    views, and for the multiview clustering; returns the multiview clusters.
    """
    # --- Singleview spherical k-means, each view clustered separately ---
    s_kmeans = SphericalKMeans(n_clusters=n_clusters,
                               random_state=seed,
                               n_init=100)
    s_clusters_v1 = s_kmeans.fit_predict(m_data[0])
    s_clusters_v2 = s_kmeans.fit_predict(m_data[1])

    # --- Singleview on the views concatenated feature-wise ---
    s_clusters = s_kmeans.fit_predict(np.hstack(m_data))

    # NMI between true class labels and each singleview clustering.
    for view_name, view_clusters in (('View 1', s_clusters_v1),
                                     ('View 2', s_clusters_v2),
                                     ('Concatenated', s_clusters)):
        score = nmi_score(labels, view_clusters)
        print('Singleview {0} NMI Score: {1:.3f}\n'.format(view_name, score))

    # --- Multiview spherical k-means over both views jointly ---
    m_kmeans = MultiviewSphericalKMeans(n_clusters=n_clusters,
                                        n_init=100,
                                        random_state=seed)
    m_clusters = m_kmeans.fit_predict(m_data)

    m_nmi = nmi_score(labels, m_clusters)
    print('Multiview NMI Score: {0:.3f}\n'.format(m_nmi))

    return m_clusters
示例#4
0
def testSpericalKMeans():
    """Smoke test: fit spherical k-means (K=3) on the module-level matrix X
    (n_examples x n_features) and print the resulting labels.
    """
    # NOTE(review): function name is misspelled ("Sperical") — kept as-is
    # for backward compatibility with existing callers.
    model = SphericalKMeans(n_clusters=3)
    model.fit(X)
    print(model.labels_)
示例#5
0
def term_clustering(terms: List[str], wv: Dict[str, np.ndarray],
                    n_clusters: int) -> Tuple[List[int], List[str]]:
    """Use spherical k-means to cluster word vectors.

  Args:
    terms: A list of terms to cluster.
    wv: A dictionary of word to their vectors.
    n_clusters: Number of output clusters.

  Returns:
    labels: A list of clustering assignment for each word.
    terms: A list of words, aligned with labels.
  """
    X = []
    X_terms = []
    n_out_of_vocab = 0
    logger.debug(f"#wv {len(wv)}")
    logger.debug(terms[:20])
    for term in terms:
        # Skip out-of-vocabulary terms; EAFP keeps it to one dict lookup.
        # (Fixed: dropped the unused exception binding and the redundant
        # `phrase = term` alias of the original.)
        try:
            emb = wv[term]
        except KeyError:
            n_out_of_vocab += 1
            continue
        X.append(emb)
        X_terms.append(term)

    logger.warning(f"{n_out_of_vocab} / {len(terms)} words out of vocab")
    logger.info(f"Clustering {len(X)} words")
    clus = SphericalKMeans(n_clusters=n_clusters)
    clus.fit(X)
    logger.info(f"Clustering complete")
    return clus.labels_, X_terms
示例#6
0
 def __init__(self, data, n_cluster):
     """Hold state for spherical k-means clustering of `data`.

     Args:
         data: matrix of row vectors to cluster.
         n_cluster: number of clusters to form.
     """
     self.data = data
     self.n_cluster = n_cluster
     self.clus = SphericalKMeans(n_cluster)
     self.clusters = defaultdict(list)  # cluster id -> member indices
     self.membership = None  # per-point cluster assignment, populated after clustering
     self.center_ids = None  # ids of the members closest to each center, populated after clustering
     self.inertia_scores = None  # clustering concentration score, populated after clustering
def doc_clustering(model, cluster_num):
    """Fit spherical k-means on every doc vector of a doc2vec `model`.

    Document tags are assumed to be named 'a_1' .. 'a_N'.
    Returns the fitted clusterer.
    """
    doc_num = len(model.docvecs.doctags.keys())
    train_data = np.array(
        [model.docvecs['a_' + str(i + 1)] for i in range(doc_num)])
    clusterer = SphericalKMeans(cluster_num)
    print('Start clustering...')
    clusterer.fit(train_data)
    print('Done.')
    return clusterer
示例#8
0
 def __init__(self, data, n_cluster):
     """Hold state for spherical k-means clustering of `data`.

     Args:
         data: matrix of row vectors to cluster.
         n_cluster: number of clusters to form.
     """
     self.data = data
     self.n_cluster = n_cluster
     # NOTE(review): the comment below mentions an added random state, but
     # no random_state argument is actually passed here — confirm.
     self.clus = SphericalKMeans(
         n_cluster)  # Change by Mili (added Random State)
     #self.clus = KMeans(n_cluster)
     self.clusters = defaultdict(list)  # cluster id -> member indices
     self.membership = None  # per-point cluster assignment, populated after clustering
     self.center_ids = None  # ids of the cluster centers, populated after clustering
     self.inertia_scores = None  # clustering concentration score, populated after clustering
     self.old2new_clusterid = None  # cluster-id remapping, populated later if clusters are reordered
示例#9
0
def kmeans_codebook(patches, k=30):
    """Build a codebook of k centroid patches via spherical k-means.

    Each 2-D patch in `patches` is flattened, the flattened vectors are
    clustered, and the k cluster centers are reshaped back to patch shape.
    """
    patch_h, patch_w = patches[0].shape
    flat = patches.reshape(-1, patch_h * patch_w)

    model = SphericalKMeans(k)
    model.fit(flat)

    return model.cluster_centers_.reshape(-1, patch_h, patch_w)
示例#10
0
 def _init_match(self):
     """Cluster question embeddings and attach a `skm_label` column.

     Rows whose `qs_embed` has zero norm are excluded from clustering
     (cosine similarity is undefined for the zero vector) and receive the
     sentinel label -1 after the merge.
     """
     skm = SphericalKMeans(n_clusters=self.config['cluster_nums'],
                           init='k-means++',
                           n_init=20)
     data = self.data
     # Keep only rows with a non-zero embedding.
     # (Fixed: the redundant `True if ... else False` ternary.)
     data = data[data['qs_embed'].apply(lambda x: np.linalg.norm(x) > 0)]
     skm.fit(data['qs_embed'].tolist())
     data['skm_label'] = skm.labels_
     data = data[['qid', 'skm_label']]
     # Left-merge so excluded rows survive with NaN, then mark them as -1.
     self.data = pd.merge(self.data, data, how='left', on=['qid'])
     self.data['skm_label'] = self.data['skm_label'].fillna(-1)
     self._cluster_centers = skm.cluster_centers_
示例#11
0
文件: utils.py 项目: wss1996/AWOE
 def cluster(self, docs, k):
     """Embed each doc via its extracted keywords, then cluster with
     spherical k-means.

     Returns (per-doc cluster labels, per-doc keyword lists).
     """
     vecs, words = [], []
     count = 0
     for doc in docs:
         count += 1
         keywords = self.extract_keywords(doc)
         words.append(keywords)
         vecs.append(self.sent2vec(keywords))
     print('processing doc {} over.'.format(count))
     model = SphericalKMeans(n_clusters=k)
     fitted = model.fit(np.array(vecs))
     return fitted.labels_, words
def SphericalkMeansCluster(X, nfclusters):
    """Run spherical k-means on X (n_examples x n_features).

    Returns the per-example cluster labels.
    """
    model = SphericalKMeans(nfclusters)
    model.fit(X)
    return model.labels_
class Clusterer:
    """Spherical k-means wrapper that also records, for every cluster, the
    member whose vector is most cosine-similar to the cluster center.
    """

    def __init__(self, data, n_cluster):
        self.data = data
        self.n_cluster = n_cluster
        self.clus = SphericalKMeans(n_cluster)
        self.clusters = defaultdict(list)  # cluster id -> member indices
        self.membership = None  # per-point cluster assignment
        self.center_ids = None  # [(cluster_id, index of most central member)]
        self.inertia_scores = None  # clustering concentration score

    def fit(self):
        """Fit the clusterer and populate membership/center bookkeeping.

        (Fixed: removed the leftover `print("bbbbbbb")` debug statements.)
        """
        self.clus.fit(self.data)
        labels = self.clus.labels_
        for idx, label in enumerate(labels):
            self.clusters[label].append(idx)
        self.membership = labels
        self.center_ids = self.gen_center_idx()
        self.inertia_scores = self.clus.inertia_
        print('Clustering concentration score:', self.inertia_scores)

    def gen_center_idx(self):
        """Return [(cluster_id, member index closest to its center), ...]."""
        ret = []
        for cluster_id in range(self.n_cluster):
            center_idx = self.find_center_idx_for_one_cluster(cluster_id)
            ret.append((cluster_id, center_idx))
        return ret

    def find_center_idx_for_one_cluster(self, cluster_id):
        """Index of the member most cosine-similar to the cluster center,
        or -1 for an empty cluster."""
        query_vec = self.clus.cluster_centers_[cluster_id]
        best_similarity, ret = -1, -1
        for member_idx in self.clusters[cluster_id]:
            cosine_sim = self.calc_cosine(query_vec, self.data[member_idx])
            if cosine_sim > best_similarity:
                best_similarity = cosine_sim
                ret = member_idx
        return ret

    def calc_cosine(self, vec_a, vec_b):
        """Cosine similarity (1 - cosine distance)."""
        return 1 - cosine(vec_a, vec_b)
示例#14
0
 def cluster(self, rounds=20, clust_range=(2, 12), num_cores=1, threshold=0.1, embeddings=None):
     """Estimate the speaker count and cluster the speaker embeddings.

     Args:
         rounds: number of Top2S estimation rounds.
         clust_range: (low, high) range of candidate cluster counts.
         num_cores: parallelism passed through to Top2S.
         threshold: Top2S silhouette threshold.
         embeddings: optional embeddings; defaults to self.embeddings_.

     Raises:
         RuntimeError: if no embeddings are available at all.
     """
     # Mutable-default-argument pitfall fixed: None sentinel instead of
     # shared [] / [2,12] list defaults.
     if embeddings is None:
         embeddings = []

     if (len(self.embeddings_) == 0) and (len(embeddings) == 0):
         raise RuntimeError("No speaker embeddings available.")

     # If embeddings are not given, fall back to the stored ones;
     # otherwise remember the given ones.
     if len(embeddings) == 0:
         embeddings = self.embeddings_
     else:
         self.embeddings_ = embeddings

     # Top Two Silhouettes: estimate the optimal number of centers.
     opt_center_num, center_dict = Top2S(embeddings, clust_range=clust_range,
                                         rounds=rounds, num_cores=num_cores,
                                         threshold=threshold)
     self.centers_ = center_dict
     self.opt_speaker_num_ = opt_center_num

     # One refinement step of spherical k-means from the chosen centers;
     # speaker labels are shifted to start at 1.
     spkmeans = SphericalKMeans(n_clusters=len(center_dict[opt_center_num]),
                                init=center_dict[opt_center_num],
                                max_iter=1, n_init=1, n_jobs=1).fit(embeddings)
     self.speaker_labels_ = spkmeans.labels_ + 1
示例#15
0
    def fit(
        self,
        data: List[Iterator[float]],
        find_n: bool = False
    ) -> Dict[str, Union[List[int], Union[float, None]]]:
        """Cluster the input data into n clusters.

        Args:
            data: A list of vectors.
            find_n: If True, don't use self.n_cluster but find n using
                elbow analysis instead
        Return:
            A list of integers as class labels. The order of the list
            corresponds to the order of the input data.
        """
        if find_n:
            # Elbow analysis is not implemented yet; n is hard-coded to 5.
            self.n_clusters = 5  # self._get_n()
        if self.clus_type == 'kmeans':
            # NOTE(review): sklearn's `k_means` is a function, not an
            # estimator class, so this call looks wrong — confirm that
            # `k_means` here is actually a KMeans-like class.
            self.cluster = k_means(n_clusters=self.n_clusters)
        elif self.clus_type == 'sphericalkmeans':
            self.cluster = SphericalKMeans(n_clusters=self.n_clusters)
        elif self.clus_type == 'agglomerative':
            self.cluster = AgglomerativeClustering(n_clusters=self.n_clusters,
                                                   affinity=self.affinity,
                                                   linkage=self.linkage)
        # NOTE(review): an unrecognized clus_type leaves self.cluster unset
        # (or stale), so the fit below would fail or reuse an old model.

        self.cluster.fit(data)
        self._calc_density()

        return {'labels': self.cluster.labels_, 'density': self.compactness}
示例#16
0
def create_clustering_methods(ngroups, g_matrix, n_init):
    """Build the name -> (estimator, distance function) table used by the
    experiments; every entry shares the same spherical distance metric.
    """
    sphere_dist = partial(dist_on_sphere, g_matrix=g_matrix)

    methods = {}
    methods["kmsim"] = (KMeansSim(n_clusters=ngroups, g_matrix=g_matrix,
                                  n_init=n_init), sphere_dist)
    methods["krbsim"] = (RepeatedBisectionSim(n_clusters=ngroups,
                                              g_matrix=g_matrix,
                                              n_init=n_init,
                                              bm='agg'), sphere_dist)
    methods["skm"] = (SphericalKMeans(n_clusters=ngroups, n_init=n_init),
                      sphere_dist)
    methods["km"] = (KMeans(n_clusters=ngroups, n_init=n_init), sphere_dist)
    methods["lgr"] = (sklearn.linear_model.LogisticRegression(
        random_state=0,
        solver='lbfgs',
        multi_class='multinomial',
        max_iter=500), sphere_dist)
    return methods
示例#17
0
    def __init__(self,
                 data,
                 n_cluster,
                 method="soft-movMF",
                 init="random-class",
                 n_init=10,
                 n_jobs=1):
        """Select and configure a directional clustering backend.

        Args:
            data: matrix of row vectors to cluster.
            n_cluster: number of clusters to form.
            method: "spk" (spherical k-means), or "hard-movMF"/"soft-movMF"
                (von Mises-Fisher mixture with hard/soft posteriors).
            init: initialization scheme for the movMF backends.
            n_init: number of restarts for the movMF backends.
            n_jobs: parallelism for the movMF backends.
        """
        self.data = data
        self.n_cluster = n_cluster
        self.method = method

        if method == "spk":
            self.clus = SphericalKMeans(n_clusters=n_cluster)
        elif method == "hard-movMF":
            self.clus = VonMisesFisherMixture(n_clusters=n_cluster,
                                              posterior_type='hard',
                                              init=init,
                                              n_init=n_init,
                                              n_jobs=n_jobs)
        elif method == "soft-movMF":
            self.clus = VonMisesFisherMixture(n_clusters=n_cluster,
                                              posterior_type='soft',
                                              init=init,
                                              n_init=n_init,
                                              n_jobs=n_jobs)
        # NOTE(review): an unrecognized `method` leaves self.clus undefined.

        self.clusters = {
        }  # cluster id -> dict(element_id: distance to center)
        self.clusters_phrase = {}  # cluster id -> representative words
        self.membership = None  # a list contain the membership of the data points
        self.center_ids = None  # a list contain the ids of the cluster centers
        self.inertia_scores = None
示例#18
0
    def visualize(self, indices=None, center_num=0,
                  ref_labels=None, use_colors=True):
        """Scatter-plot a 2-D TSNE projection of the embeddings, colored by
        speaker cluster.

        Args:
            indices: subset of embedding indices to plot (default: all).
            center_num: center configuration to visualize (default: the
                optimum found during clustering).
            ref_labels: optional reference labels plotted instead of the
                predicted ones.
            use_colors: color points per speaker when True.

        Raises:
            RuntimeError: if clustering has not been performed yet.
        """
        # Mutable-default-argument pitfall fixed: None sentinels.
        if indices is None:
            indices = []
        if ref_labels is None:
            ref_labels = []

        # If indices are not given, plot all embeddings
        if len(indices) == 0:
            indices = np.arange(len(self.embeddings_))

        # If center number is not given, use the clustering optimum
        if center_num == 0:
            center_num = self.opt_speaker_num_

        # If reference labels are used
        if len(ref_labels) != 0:
            speaker_labels = ref_labels

        # Allow visualization of different center number configurations
        else:
            # Re-derive speaker labels with one spherical k-means refinement
            # step from the stored centers; labels start at 1.
            spkmeans = SphericalKMeans(n_clusters=len(self.centers_[center_num]),
                                       init=self.centers_[center_num],
                                       max_iter=1, n_init=1, n_jobs=1).fit(self.embeddings_[indices])
            speaker_labels = spkmeans.labels_ + 1

        if len(self.speaker_labels_) == 0:
            raise RuntimeError("Clustering not performed.")

        # Compute TSNE only once and cache the 2-D projection
        if len(self.emb_2d_) == 0:
            print("Computing TSNE transform...")
            tsne = TSNE(n_jobs=4)
            self.emb_2d_ = tsne.fit_transform(self.embeddings_)

        # Visualize
        emb_2d = self.emb_2d_[indices]
        # Fixed: np.int was removed in NumPy 1.24 — use the builtin int.
        speaker_labels = speaker_labels.astype(int)
        speakers = np.unique(speaker_labels)
        colors = cm.rainbow(np.linspace(0, 1, len(speakers)))
        plt.figure(figsize=(7, 7))

        for speaker in speakers:

            speak_ind = np.where(speaker_labels == speaker)[0]
            x, y = np.transpose(emb_2d[speak_ind])
            if use_colors == True:
               plt.scatter(x, y, c="k", edgecolors=colors[speaker-1], s=2,  label=speaker)
            else:
               plt.scatter(x, y, c="k", edgecolors="k", s=2,  label=speaker)

        plt.legend(title = "Speakers", prop={'size': 10})

        if len(ref_labels) == 0:
            plt.title("Predicted speaker clusters")
        else:
            plt.title("Reference speaker clusters")
        plt.show()
示例#19
0
def cluster_doc(doc_emb, K, method):
    """Cluster document embeddings into K groups.

    `method` selects "kmeans" or "skmeans" (spherical k-means); any other
    value returns an empty list.
    """
    y_pred = []
    if method == "kmeans":
        # k-means
        print("Clustering using K-Means")
        from sklearn.cluster import KMeans
        model = KMeans(n_clusters=K, n_init=1)
        model.fit(doc_emb)
        y_pred = model.labels_
    elif method == "skmeans":
        # spherical k-means
        print("Clustering using Spherical K-Means")
        from spherecluster import SphericalKMeans
        model = SphericalKMeans(n_clusters=K, n_init=1)
        model.fit(doc_emb)
        y_pred = model.labels_
    return y_pred
示例#20
0
def silh_score(emb, guess, mode=0):
    """Cluster `emb` into `guess` spherical k-means clusters and score it.

    mode 0: returns (cosine silhouette score, labels, centers);
    otherwise just the cosine silhouette score.
    """
    model = SphericalKMeans(n_clusters=guess, max_iter=300,
                            n_init=1, n_jobs=1).fit(emb)
    labels = model.labels_
    centers = model.cluster_centers_
    score = silhouette_score(emb, labels, metric="cosine")
    if mode == 0:
        return score, labels, centers
    return score
示例#21
0
def SphericalKMeans_model(vocab_embeddings, vocab, topics, rerank, rand,
                          weights):
    """Weighted spherical k-means over vocabulary embeddings.

    Returns (cluster assignment per word, top-word indices per topic).
    When `rerank` is set 100 candidate words per topic are kept, else 10.
    """
    spkmeans = SphericalKMeans(n_clusters=topics,
                               random_state=rand).fit(vocab_embeddings,
                                                      sample_weight=weights)
    m_clusters = spkmeans.predict(vocab_embeddings, sample_weight=weights)
    centers = np.array(spkmeans.cluster_centers_)

    top_k = 100 if rerank else 10
    indices = []
    for topic_id in range(topics):
        topk_vals = sort_closest_cossine_center(centers[topic_id], m_clusters,
                                                vocab_embeddings, topic_id)
        indices.append(find_top_k_words(top_k, topk_vals, vocab))
    return m_clusters, indices
    def initialize(self):
        """Initialize the NMTF2 factor matrices.

        G1/G2 are reused from NMTF1; G3 is seeded with the transposed
        spherical k-means centers of relation matrix R23, and the S blocks
        are computed from the (masked) relation matrices.
        """
        # Mask the R12 relation matrix with the train mask M.
        self.R12_train = np.multiply(NMTF2.R12, self.M)
        """spherical k-means"""
        skm3 = SphericalKMeans(n_clusters=self.K[2])
        skm3.fit(NMTF2.R23)

        #Reload matrices that have already been used before
        self.G1 = NMTF1.G1
        self.G2 = NMTF1.G2
        # Transposed cluster centers seed the new G3 factor.
        self.G3 = skm3.cluster_centers_.transpose()

        self.S12 = np.linalg.multi_dot(
            [self.G1.transpose(), self.R12_train, self.G2])
        self.S23 = np.linalg.multi_dot(
            [self.G2.transpose(), NMTF2.R23, self.G3])

        #Save G3 for the next models
        NMTF2.G3 = self.G3
示例#23
0
    def semantic_sim_driver(self, time_mapping, log_filename="yao_test1.txt"):
        """Evaluate temporal embeddings on the Yao test set #1.

        Clusters the test words' year-specific embeddings with spherical
        k-means for n in {10, 15, 20}, scores each clustering against the
        gold labels, prints the scores and writes them to `log_filename`.
        """
        df = pd.read_csv("eval/yao/testset_1.csv")

        # Map raw years through `time_mapping`; fall back to decade keys
        # like "1990s" when exact-year keys are missing.
        # Bug fix: `df.real_year = ...` only sets a DataFrame *attribute*,
        # not a column — use item assignment to create real columns.
        try:
            df['real_year'] = df.year.apply(lambda x: int(time_mapping[str(x)]))

        except Exception as e:
            print(e)
            print(time_mapping.keys())
            print(df.year.unique())
            df['real_year'] = df.year.apply(
                lambda x: int(time_mapping[str(x // 10 * 10) + "s"]))

        labels = set(df.label.unique())
        labels_mapping = {label: index for index, label in enumerate(labels)}
        df['label_id'] = df.label.apply(lambda x: labels_mapping[x])

        embeddings, known_index = self.get_embedding_in_a_year(
            df.word, df['real_year'].tolist(), return_known_index=True)

        from spherecluster import SphericalKMeans

        scores = []
        for n in [10, 15, 20]:
            skm = SphericalKMeans(n_clusters=n)
            skm.fit(embeddings)
            # Only words found in the vocabulary (known_index) are scored.
            scores.append(get_score(skm.labels_, df['label_id'][known_index]))
            scores.append(get_score1(skm.labels_, df['label_id'][known_index]))

        print(scores)

        with open(log_filename, "w", encoding="utf-8") as f:
            line = "\t".join("{0:.4f}".format(s) for s in scores) + "\n"
            print(line)
            f.write(line)

        return None
示例#24
0
def get_topic_vecs(model, n_topics=20):
    """Compute the topic vectors of a doc2vec model.

    The topic vectors are simply the centroids of the document clusters, so
    each acts as a "virtual" document averaging a group of similar ones.

    Arguments:
        - (gensim.models.doc2vec.Doc2Vec) model: A doc2vec model
        - (<float>) n_topics: The number of topics to find, defaults to 20.
    Returns:
        - (numpy.ndarray) topics: The topic vectors of the model
    """
    from spherecluster import SphericalKMeans
    clusterer = SphericalKMeans(n_clusters=n_topics)
    # Cluster the raw document vectors by topic.
    doc_vectors = model.docvecs.vectors_docs
    clusterer.fit(doc_vectors)
    # The group centroids are the topic vectors.
    return clusterer.cluster_centers_
示例#25
0
def Silhouette(X, seguradora):
    """Pick the cluster count (2..10) with the best mean silhouette for an
    insurer's data, saving a silhouette-vs-k plot along the way.

    Returns the best n_clusters, or None when there are too few samples to
    try any clustering.
    """
    insurance_label = dbm.GetAccountLabel(seguradora)
    # Try at most 10 clusters (the range upper bound is exclusive).
    maxx = min(len(X), 11)

    range_of_clusters = list(range(2, maxx))
    clusters_silhouette = dict()

    for n_clusters in range_of_clusters:
        # Fixed random seed for reproducibility.
        clusterer = SKMeans(n_clusters=n_clusters, random_state=0)
        cluster_labels = clusterer.fit_predict(X)

        # Average silhouette over all samples — a density/separation measure
        # of the formed clusters.
        silhouette_avg = silhouette_score(X, cluster_labels)
        clusters_silhouette[n_clusters] = silhouette_avg

    plt.title('Silhueta media de %s' % insurance_label)
    plt.xlabel('Numero de clusters', fontsize=16)
    plt.ylabel("Silhueta media", fontsize=16)
    plt.plot(list(clusters_silhouette.keys()),
             list(clusters_silhouette.values()))
    plt.savefig("../analytics/%s/%s_silhuette.png" \
        % (insurance_label, insurance_label))
    plt.close()

    # Bug fix: dict.iteritems() is Python 2 only — use items(). Also removed
    # the unused silhouette_samples computation.
    best = max(clusters_silhouette.values(), default=None)
    for k, v in clusters_silhouette.items():
        if v == best:
            return k
    def initialize(self):
        """Initialize the NMTF5 factor matrices.

        G1..G4 are reused from the earlier models; G5 is seeded with the
        transposed spherical k-means centers of relation matrix R25, then
        the S blocks are computed from the relation matrices.
        """
        # Mask the R12 relation matrix with the train mask M.
        self.R12_train = np.multiply(NMTF5.R12, self.M)
        """spherical k-means"""
        skm5 = SphericalKMeans(n_clusters=self.K[4])
        skm5.fit(NMTF5.R25)

        # Reuse factors computed by the earlier models.
        self.G1 = NMTF1.G1
        self.G2 = NMTF1.G2
        self.G3 = NMTF2.G3
        self.G4 = NMTF3.G4
        # Transposed cluster centers seed the new G5 factor.
        self.G5 = skm5.cluster_centers_.transpose()

        self.S12 = np.linalg.multi_dot(
            [self.G1.transpose(), self.R12_train, self.G2])
        self.S23 = np.linalg.multi_dot(
            [self.G2.transpose(), NMTF5.R23, self.G3])
        self.S34 = np.linalg.multi_dot(
            [self.G3.transpose(), NMTF5.R34, self.G4])
        self.S25 = np.linalg.multi_dot(
            [self.G2.transpose(), NMTF5.R25, self.G5])
示例#27
0
    def score_embeddings(self, min_length, max_num_speakers):
        """ Score embeddings.

        For each embedding set, clusters the long-enough embeddings into
        speaker centroids (spherical or plain k-means, optionally refined
        with PLDA, or x-means when the speaker count is unknown) and scores
        every embedding against the centroids.

        Args:
            min_length (int): minimal length of segment used for clustering in miliseconds
            max_num_speakers (int): maximal number of speakers

        Returns:
            dict: dictionary with scores for each file
        """
        scores_dict = {}
        logger.info('Scoring using `{}`.'.format('PLDA' if self.plda is not None else 'cosine distance'))
        for embedding_set in self.embeddings:
            name = os.path.normpath(embedding_set.name)
            embeddings_all = embedding_set.get_all_embeddings()
            # Cluster only on segments longer than min_length.
            embeddings_long = embedding_set.get_longer_embeddings(min_length)
            if len(embeddings_long) == 0:
                logger.warning(
                    'No embeddings found longer than {} for embedding set `{}`.'.format(min_length, name))
                continue
            size = len(embedding_set)
            if size > 0:
                logger.info('Clustering `{}` using {} long embeddings.'.format(name, len(embeddings_long)))
                if embedding_set.num_speakers is not None:
                    # Known speaker count: cluster directly into that many
                    # centroids; spherical k-means when L2-norm is used.
                    num_speakers = embedding_set.num_speakers
                    if self.use_l2_norm:
                        kmeans_clustering = SphericalKMeans(
                            n_clusters=num_speakers, n_init=1000, n_jobs=1).fit(embeddings_long)
                    else:
                        kmeans_clustering = sklearnKMeans(
                            n_clusters=num_speakers, n_init=1000, n_jobs=1).fit(embeddings_long)
                    if self.plda is None:
                        centroids = kmeans_clustering.cluster_centers_
                    else:
                        # Refine the k-means centroids with PLDA-based k-means.
                        centroids = PLDAKMeans(
                            kmeans_clustering.cluster_centers_, num_speakers, self.plda).fit(embeddings_long)
                else:
                    # Unknown speaker count: estimate it with x-means first,
                    # then cluster with plain k-means.
                    xm = xmeans(embeddings_long, kmax=max_num_speakers)
                    xm.process()
                    num_speakers = len(xm.get_clusters())
                    kmeans_clustering = sklearnKMeans(
                        n_clusters=num_speakers, n_init=100, n_jobs=1).fit(embeddings_long)
                    centroids = kmeans_clustering.cluster_centers_
                # Score every embedding against the centroids: s-norm if a
                # normalizer is configured, else PLDA or cosine similarity.
                if self.norm is None:
                    if self.plda is None:
                        scores_dict[name] = cosine_similarity(embeddings_all, centroids).T
                    else:
                        scores_dict[name] = self.plda.score(embeddings_all, centroids)
                else:
                    scores_dict[name] = self.norm.s_norm(embeddings_all, centroids)
            else:
                logger.warning('No embeddings to score in `{}`.'.format(embedding_set.name))
        return scores_dict
示例#28
0
 def cluster_test(self, test_file, clusters=(10,)):
     """Evaluate temporal embeddings by clustering test words.

     For each K in `clusters`, fit spherical k-means on the word vectors
     and compare the assignment with section labels via NMI and F_beta.

     Args:
         test_file: CSV with (word, section, year) rows.
         clusters: iterable of cluster counts to evaluate. (Bug fix: the
             old default `10` was a bare int, which the `for K in clusters`
             loop cannot iterate.)
     Returns:
         dict mapping 'NMI(K)' / 'F_beta-score(K)' to the scores.
     """
     df_test1 = pd.read_csv(test_file)
     output = {}
     for K in clusters:
         vectors = list()
         y_true = list()
         sections = dict()
         idx = 0
         for word, section, y in df_test1.values:
             sliceIdx = self.yearDict[str(y)]
             # Only words present in that year-slice vocabulary are usable.
             if word in self.vocabularies[sliceIdx]:
                 if section not in sections:
                     sections[section] = idx
                     idx += 1
                 y_true.append(sections[section])
                 vectors.append(self.matrices_norm[sliceIdx][
                     self.vocabularies[sliceIdx][word]])
         skm = SphericalKMeans(n_clusters=K, max_iter=100000)
         X = np.array(vectors)
         skm.fit(X)
         # Predict once and reuse (the original called predict three times).
         y_pred = skm.predict(X)
         metric = normalized_mutual_info_score(y_pred,
                                               y_true,
                                               average_method='arithmetic')
         # Pairwise same-cluster indicators for the F_beta comparison.
         y_true_bool = [(triplet1 == triplet2) for triplet2 in y_true
                        for triplet1 in y_true]
         y_pred_bool = [(triplet1 == triplet2) for triplet2 in y_pred
                        for triplet1 in y_pred]
         metric2 = fbeta_score(y_true_bool, y_pred_bool, beta=5)
         output[f'NMI({K})'] = metric
         output[f'F_beta-score({K})'] = metric2
     return output
示例#29
0
def sphericalKMeans(num):
    """Run spherical k-means with `num` clusters on a fixed 2-D toy set and
    append the labels and centers to Kmeans.txt.

    Bug fix: the old body overwrote `num` with 4, silently ignoring the
    argument; a `with` block now also guarantees the file is closed.
    """
    points = np.array([[-4, -2], [-3, -2], [-2, -2], [-1, -2], [1, -1], [1, 1],
                       [2, 3], [3, 2], [3, 4], [4, 3]])
    kmeans = SphericalKMeans(n_clusters=num).fit(points)
    with open("Kmeans.txt", "a+") as out:
        out.write("Spherical K means output for cluster size : " + str(num) +
                  "\n")
        out.write("Clusters index of points" + "\n")
        out.write(str(kmeans.labels_) + "\n")
        out.write("Center of Clusters\n")
        out.write(str(kmeans.cluster_centers_) + "\n")
def analyse(methode,
            preproc,
            true_label,
            nb_clusters=3,
            normalizer=True,
            scikit=True):
    """Compare spherical k-means against plain k-means on (optionally
    transformed and L2-normalized) data: plot elbow curves, fit both models
    with `nb_clusters`, and print their scores.

    Args:
        methode: a scikit-learn transformer applied to `preproc` when
            `scikit` is True.
        preproc: the input data (raw, or pre-transformed when scikit=False).
        true_label: ground-truth labels used for scoring.
        nb_clusters: number of clusters for both models.
        normalizer: L2-normalize the data when True.
        scikit: whether to run `methode.fit_transform` first.

    Returns:
        (methode, fitted skmeans, fitted kmeans, data)
    """
    if scikit:
        data = methode.fit_transform(preproc)
    else:
        data = preproc
    if normalizer:
        data = Normalizer(norm='l2', copy=False).fit_transform(data)
    skplt.cluster.plot_elbow_curve(SphericalKMeans(random_state=42, n_jobs=-1),
                                   data,
                                   title="Elbow Curve avec Spherical K-means",
                                   cluster_ranges=range(1, 15))
    skplt.cluster.plot_elbow_curve(KMeans(random_state=42,
                                          n_jobs=-1,
                                          precompute_distances=True),
                                   data,
                                   title="Elbow Curve avec K-means",
                                   cluster_ranges=range(1, 15))
    # Bug fix: these two status lines were bare tuples (no-ops) in the
    # original — they were clearly meant to be print calls.
    print("Fitting For Spherical K-means for ", nb_clusters, "...")
    skmeans = SphericalKMeans(n_clusters=nb_clusters,
                              random_state=42,
                              n_jobs=-1).fit(data)
    print("Fitting For K-means for ", nb_clusters, "...")
    kmeans = KMeans(n_clusters=nb_clusters,
                    random_state=42,
                    n_jobs=-1,
                    precompute_distances=True).fit(data)
    y_pred_skmeans = skmeans.predict(data)
    y_pred_kmeans = kmeans.predict(data)
    print("Results from Spherical K-means")
    scoring_cluster(skmeans, true_label, y_pred_skmeans)
    print("Results from K-means")
    scoring_cluster(kmeans, true_label, y_pred_kmeans)
    return methode, skmeans, kmeans, data