def get_km(xs, n):
    """Cluster ``xs`` with k-medoids and draw a held-out sample.

    :param xs: sequence/array of data points to cluster.
    :param n: number of medoids (clusters) to select.
    :returns: tuple ``(kmidx, testidx)`` where ``kmidx`` is the list of
        medoid indices and ``testidx`` is a random sample of at most 100
        non-medoid indices.
    """
    # BUG FIX: the keyword is ``n_clusters`` (plural) — every other
    # KMedoids call in this file uses it; ``n_cluster`` would raise a
    # TypeError or be silently ignored depending on the implementation.
    km = KMedoids(n_clusters=n, max_iter=1000, tol=1e-5)
    km.fit(xs)
    kmidx = list(km.medoids)
    # Use a set for O(1) membership instead of scanning the list per index.
    medoid_set = set(kmidx)
    testidx = [i for i in range(len(xs)) if i not in medoid_set]
    np.random.shuffle(testidx)
    return kmidx, testidx[:100]
def run_kmedoids(self, params):
    """
    Performs clustering using the k-medoids algorithm.

    :type params: dictionary
    :param params: {'k','t_max','init','criterion'}
    """
    k = params['k']
    self.n_clusters_ = k
    # Fit k-medoids on the precomputed distance matrix held by this object.
    model = KMedoids(
        n_clusters=k,
        max_iter=params['t_max'],
        init=params['init'],
        criterion=params['criterion'],
    )
    model.fit(self.dist_mat)
    # Expose results on the instance, then post-process them.
    self.centers_l = model.cluster_centers_
    self.labels_l = model.labels_
    self.form_kmedoids_results()
def fit(self, X, y, **kwargs):
    """Pick ``self.mk`` medoids per class and fit ``self.clf`` on them.

    For each class label in ``y``, runs k-medoids (seeded with k-means++)
    over that class's rows of ``X``, collects the selected medoid indices,
    and trains the wrapped classifier on the medoid vectors only.
    Returns ``self`` for chaining.
    """
    if self.verbose:
        print("fitting", self)
    per_class_medoids = []
    for c, label in enumerate(sorted(set(y))):
        class_idxs = np.where(y == label)[0]
        # Seed the medoid search with k-means++ picks within this class.
        seed_idxs = kmeanspp(X[class_idxs], self.mk, seed=self.seed)
        km = KMedoids(self.mk, init=seed_idxs)
        km.fit(X[class_idxs], dist=False)
        # Map medoid positions back to indices into the full X.
        per_class_medoids.append(class_idxs[km.medoids].tolist())
    self.idxs = np.concatenate(per_class_medoids, axis=0)
    self.vecs = X[self.idxs]
    counts = Counter(y[self.idxs])
    # Sanity check: every class contributed exactly self.mk medoids.
    assert set(counts.values()) == set({self.mk})
    if self.verbose:
        print("fitted KMedoids seed={},".format(self.seed), counts)
    self.clf.fit(self.vecs, y[self.idxs])
    return self
} else: raise Exception(f"Not recognized dataset: {args.dataset}") if args.dis == "euclidean": make_pretrainer = lambda: KMeans(n_clusters=n_clusters) dis = DMAE.Dissimilarities.euclidean dis_loss = DMAE.Losses.euclidean_loss init_dmae = lambda pretrainer: { "centers": DMAE.Initializers.InitKMeans(pretrainer), "mixers": tf.keras.initializers.Constant(1.0) } cov = False elif args.dis == "cosine": make_pretrainer = lambda: KMedoids(n_clusters=n_clusters, metric="cosine") dis = DMAE.Dissimilarities.cosine dis_loss = DMAE.Losses.cosine_loss init_dmae = lambda pretrainer: { "centers": DMAE.Initializers.InitKMeans(pretrainer), "mixers": tf.keras.initializers.Constant(1.0) } cov = False elif args.dis == "manhattan": make_pretrainer = lambda: KMedoids(n_clusters=n_clusters, metric="manhattan") dis = DMAE.Dissimilarities.manhattan dis_loss = DMAE.Losses.manhattan_loss init_dmae = lambda pretrainer: { "centers": DMAE.Initializers.InitKMeans(pretrainer),
def experiments(PORCENTAJE_VECINOS, ALGORITHM, MODELO, normalizar=None):
    """Run one clustering experiment and write its report to disk.

    Builds an affinity/distance matrix ``X`` by mixing the module-level
    cosine-similarity matrix ``coseno`` with a neighbour matrix computed by
    the selected algorithm, clusters it with the model named by ``MODELO``,
    and writes cluster contents plus quality metrics to
    ``sys.argv[2]/<title>.out``.  Skips the run if that file already exists.

    :param PORCENTAJE_VECINOS: mixing weight in [0, 1], or one of the
        special modes "boost" / "maxsim" / "dist"; 0 means use ``coseno``
        alone.
    :param ALGORITHM: key into the module-level ``algorithms`` dict
        selecting the neighbour-scoring function.
    :param MODELO: clustering model name: 'kmedoids', 'kmedoids470',
        'ap' (affinity propagation) or 'dbscan'.
    :param normalizar: optional normalisation scheme for the neighbour
        matrix ('minmax', 'scale', 'robust', 'softmax', 'matrixminmax',
        'matrixmax').

    NOTE(review): relies on module globals (``coseno``, ``service_number``,
    ``followers``, ``users``, ``labels_true``, ``service_list``) whose
    definitions are outside this chunk — presumably set up by the script
    prologue; verify before reuse.
    """
    vecinos = algorithms[ALGORITHM]
    # Build a human-readable experiment title, e.g. "ap-jaccard-0.5".
    algoritmos = "coseno"
    if PORCENTAJE_VECINOS in ["boost", "maxsim", "dist"]:
        algoritmos = ALGORITHM + "-" + PORCENTAJE_VECINOS
    elif PORCENTAJE_VECINOS != 0:
        algoritmos = "%s-%.1f" % (ALGORITHM, PORCENTAJE_VECINOS)
    titulo = MODELO + "-" + algoritmos
    if normalizar is not None:
        titulo += "-" + normalizar
    fname = sys.argv[2] + "/" + titulo + ".out"
    # Idempotence: a finished experiment leaves its .out file behind.
    if os.path.isfile(fname):
        return
    print(titulo)
    print("-" * 20)
    if PORCENTAJE_VECINOS == 0:
        X = coseno
        if MODELO == "dbscan":
            # Only works for cosine! DBSCAN needs a distance, so invert
            # the similarity.
            X = 1 - X
    else:
        # Neighbour matrix is expensive: cache it on disk per algorithm.
        neighbour_file_name = sys.argv[2] + "/" + ALGORITHM + ".npy"
        if os.path.isfile(neighbour_file_name):
            NEIGHBOURS = np.load(neighbour_file_name)
        else:
            print("Calculando vecinos")
            NEIGHBOURS = np.zeros((len(service_number), len(service_number)))
            # Symmetric matrix: compute the upper triangle and mirror it.
            for i in range(0, len(service_number)):
                for j in range(i, len(service_number)):
                    NEIGHBOURS[i][j] = vecinos(followers, users, i, j)
                    if i != j:
                        NEIGHBOURS[j][i] = NEIGHBOURS[i][j]
            np.save(neighbour_file_name, NEIGHBOURS)
        if normalizar is not None:
            print("Normalizando Vecinos")
            # 'minmax'/'scale'/'robust' normalise per column (sklearn
            # defaults); 'softmax' per row; 'matrix*' over the whole matrix.
            if normalizar == 'minmax':
                NEIGHBOURS = preprocessing.minmax_scale(NEIGHBOURS)
            elif normalizar == 'scale':
                NEIGHBOURS = preprocessing.scale(NEIGHBOURS)
            elif normalizar == 'robust':
                NEIGHBOURS = preprocessing.robust_scale(NEIGHBOURS)
            elif normalizar == 'softmax':
                NEIGHBOURS = np.exp(NEIGHBOURS) / np.sum(np.exp(NEIGHBOURS), axis=1, keepdims=True)
            elif normalizar == 'matrixminmax':
                NEIGHBOURS = (NEIGHBOURS - np.min(NEIGHBOURS)) / (np.max(NEIGHBOURS) - np.min(NEIGHBOURS))
            elif normalizar == 'matrixmax':
                NEIGHBOURS = NEIGHBOURS / np.max(NEIGHBOURS)
        if MODELO == "dbscan":
            # Distance case: DBSCAN wants dissimilarities, so flip the
            # neighbour scores (1-x when normalised to [0,1], plain
            # negation otherwise) and blend with cosine distance.
            if normalizar is not None:
                NEIGHBOURS = 1 - NEIGHBOURS
            else:
                NEIGHBOURS = - NEIGHBOURS
            X = (1 - PORCENTAJE_VECINOS) * (1 - coseno) + PORCENTAJE_VECINOS * NEIGHBOURS
        else:
            # Affinity case: combine similarity matrices per mode.
            if PORCENTAJE_VECINOS == "boost":
                X = np.multiply(coseno, NEIGHBOURS)
            elif PORCENTAJE_VECINOS == "maxsim":
                X = np.maximum(coseno, NEIGHBOURS)
            elif PORCENTAJE_VECINOS == "dist":
                # Rank-agreement boost: double argsort yields each entry's
                # rank; close ranks between the two matrices give a score
                # near log(1)=0, diverging ranks give large negatives.
                NEIGHBOURS_SORTED = np.argsort(np.argsort(NEIGHBOURS))
                COSINE_SORTED = np.argsort(np.argsort(coseno))
                POS_BOOST = np.log(1 / (1 + np.abs(NEIGHBOURS_SORTED - COSINE_SORTED)))
                X = POS_BOOST
            else:
                # Numeric weight: linear interpolation of the similarities.
                X = (1 - PORCENTAJE_VECINOS) * coseno + PORCENTAJE_VECINOS * NEIGHBOURS
    print("Generando Modelo")
    # NOTE(review): the second test should likely be ``elif`` for
    # consistency with the chain; behaviour is unchanged since MODELO
    # matches at most one branch.
    if MODELO == 'kmedoids':
        model = KMedoids(n_clusters=1500).fit(X)
    if MODELO == 'kmedoids470':
        model = KMedoids(n_clusters=470).fit(X)
    elif MODELO == 'ap':
        model = AffinityPropagation(affinity='precomputed').fit(X)
    elif MODELO == 'dbscan':
        model = DBSCAN(metric='precomputed').fit(X)
    labels = model.labels_
    # Group item indices by their assigned cluster label.
    clusters = defaultdict(list)
    for index, classif in enumerate(labels):
        clusters[classif].append(index)
    n_clusters_ = len(clusters)
    # Assemble the metrics report comparing against ground-truth labels.
    info = ""
    info += 'Clusters: %d\n' % n_clusters_
    # info += 'Cohesiveness: %0.3f\n' % cohesiveness(X, labels)
    info += 'Entropy: %0.3f\n' % entropy(labels_true, labels)
    info += "Homogeneity: %0.3f\n" % metrics.homogeneity_score(labels_true, labels)
    info += "Completeness: %0.3f\n" % metrics.completeness_score(labels_true, labels)
    info += "V-measure: %0.3f\n" % metrics.v_measure_score(labels_true, labels)
    info += 'Purity: %0.3f\n' % purity(labels_true, labels)
    info += "F-Measure: %0.3f\n" % fmeasure(labels_true, labels)
    info += "Adjusted Rand Index: %0.3f\n" % metrics.adjusted_rand_score(labels_true, labels)
    info += "Adjusted Mutual Information: %0.3f\n" % metrics.adjusted_mutual_info_score(labels_true, labels)
    clustersize = Counter(labels)
    salida = open(fname, 'w', encoding='UTF-8')
    print(info)
    salida.write(titulo + "\n")
    # One line per cluster: id, majority true label, majority count /
    # cluster size, then the member service names.
    for cluster, services in clusters.items():
        countcat = Counter([labels_true[svc] for svc in services])
        max_key, num = countcat.most_common(1)[0]
        salida.write("%i (%s - %i/%i): %s \n" % (
            cluster, max_key, num, clustersize[cluster],
            ",".join([service_list[svc] for svc in services])))
    salida.write("-" * 20 + "\n")
    salida.write(info)
    salida.close()