def run_gaussian_initial_starting_points(self, nb_clusters, src_file, data_without_target, dataset_name,
                                         initial_clusters_file, initial_clusters, run_number, run_info=None):
    import tensorflow as tf
    # constant_op is the TF 1.x internal module behind tf.constant; imported
    # here (rather than at module level) to keep TensorFlow a lazy dependency.
    from tensorflow.python.framework import constant_op

    output_file, centroids_file = self._prepare_files(dataset_name, run_info, True)

    if self.seed is not None:
        tf.set_random_seed(self.seed)

    points = data_without_target.values

    def get_input_fn():
        def input_fn():
            # The estimator expects (features, labels); GMM ignores labels.
            return constant_op.constant(points.astype(np.float32)), None

        return input_fn

    # Fit a Gaussian mixture model starting from the supplied centroids.
    gmm = tf.contrib.factorization.GMM(num_clusters=nb_clusters, initial_clusters=initial_clusters)
    gmm.fit(input_fn=get_input_fn(), steps=1)

    cluster_indices = list(gmm.predict_assignments())
    ClusteringToolkit._save_clustering(TensorFlow._clustering_to_list(points, cluster_indices), output_file)
    ClusteringToolkit._save_centroids(TensorFlow._centroids_to_list(gmm), centroids_file)

    return output_file, {"centroids": centroids_file}
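
# A standalone sketch of the same tf.contrib.factorization.GMM pattern, for
# readers unfamiliar with the TF 1.x contrib API (removed in TF 2.x). The
# names X, k and init are assumptions: an (n, d) float array, the cluster
# count, and a (k, d) float32 centroid array. Illustrative only, not the
# method above verbatim.
def _example_tf_gmm_sketch(X, k, init):
    import numpy as np
    import tensorflow as tf

    def input_fn():
        # (features, labels); the GMM estimator ignores labels.
        return tf.constant(X.astype(np.float32)), None

    gmm = tf.contrib.factorization.GMM(num_clusters=k, initial_clusters=init)
    gmm.fit(input_fn=input_fn, steps=1)
    return list(gmm.predict_assignments())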
def run_kmeans_base(self, nb_clusters, src_file, data_without_target, dataset_name, run_number,
                    config_function, run_info=None, nb_iterations=None):
    self._init()
    output_file, centroids_file = self._prepare_files(dataset_name, run_info, True)

    # Shogun stores feature matrices with one example per column, hence the transpose.
    train_features = shogun.RealFeatures(data_without_target.values.astype("float64").transpose())

    # Distance metric over the feature matrix: Euclidean distance.
    distance = shogun.EuclideanDistance(train_features, train_features)

    # Create the KMeans object.
    kmeans = shogun.KMeans(nb_clusters, distance)

    if config_function is not None:
        config_function(kmeans)

    if nb_iterations is not None:
        kmeans.set_max_iter(nb_iterations)

    centers, result = Shogun._kmeans_process(kmeans)

    ClusteringToolkit._save_clustering(Shogun._clustering_to_list(data_without_target, result), output_file)
    ClusteringToolkit._save_centroids(Shogun._centroids_to_list(centers), centroids_file)

    return output_file, {"centroids": centroids_file}
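
# A sketch of what a helper like Shogun._kmeans_process plausibly does, using
# Shogun's modular Python API (assumption; the real helper is not shown here):
# train the model, then read back the learned centers and the assignments.
def _example_shogun_kmeans_process_sketch(kmeans):
    kmeans.train()
    centers = kmeans.get_cluster_centers()
    result = kmeans.apply()
    return centers, result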
def run_meanshift(self, nb_clusters, src_file, data_without_target, dataset_name, run_number, run_info=None):
    self._init()
    output_file, centroids_file = self._prepare_files(dataset_name, run_info, True)

    # Create and fit the model. MeanShift infers the number of clusters from
    # the data, so nb_clusters is unused here.
    model = sklearn.cluster.MeanShift()
    model.fit(data_without_target)

    self._save_clustering(self._clustering_to_list(data_without_target, model.labels_), output_file)
    ClusteringToolkit._save_centroids(self._centroids_to_list(model), centroids_file)

    return output_file, {"centroids": centroids_file}
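
# A standalone sketch of the scikit-learn MeanShift pattern used above, with
# an explicit bandwidth estimate (an addition for illustration; run_meanshift
# itself uses the default bandwidth). X is assumed to be an (n, d) array-like.
def _example_meanshift_sketch(X):
    import sklearn.cluster

    # The bandwidth sets the kernel size and thus how many clusters emerge.
    bandwidth = sklearn.cluster.estimate_bandwidth(X, quantile=0.2)
    model = sklearn.cluster.MeanShift(bandwidth=bandwidth)
    model.fit(X)
    return model.labels_, model.cluster_centers_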
def run_kmeans(self, nb_clusters, src_file, data_without_target, dataset_name, initial_clusters_file,
               initial_clusters, run_number, run_info=None, nb_iterations=None):
    import tensorflow as tf

    output_file, centroids_file = self._prepare_files(dataset_name, run_info, True)

    if self.seed is not None:
        tf.set_random_seed(self.seed)

    kmeans = tf.contrib.factorization.KMeansClustering(num_clusters=nb_clusters,
                                                       initial_clusters=initial_clusters,
                                                       use_mini_batch=False)

    points, input_fn = TensorFlow._build_points_and_input_fn(data_without_target)

    # Default to 10 training iterations when none are requested.
    TensorFlow._train_kpp(input_fn, kmeans, 10 if nb_iterations is None else nb_iterations)

    cluster_indices = list(kmeans.predict_cluster_index(input_fn))
    ClusteringToolkit._save_clustering(TensorFlow._clustering_to_list(points, cluster_indices), output_file)
    ClusteringToolkit._save_centroids(TensorFlow._centroids_to_list(kmeans), centroids_file)

    return output_file, {"centroids": centroids_file}
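
# A sketch of the TF 1.x KMeansClustering estimator loop that helpers like
# _build_points_and_input_fn and _train_kpp presumably wrap (assumption; the
# helpers themselves are not shown). It mirrors the canonical contrib example:
# train for a fixed number of iterations, then read assignments and centers.
def _example_tf_kmeans_sketch(points, k, num_iterations=10):
    import tensorflow as tf

    def input_fn():
        # One pass over the full dataset per train() call.
        return tf.train.limit_epochs(tf.convert_to_tensor(points, dtype=tf.float32), num_epochs=1)

    kmeans = tf.contrib.factorization.KMeansClustering(num_clusters=k, use_mini_batch=False)
    for _ in range(num_iterations):
        kmeans.train(input_fn)

    return list(kmeans.predict_cluster_index(input_fn)), kmeans.cluster_centers()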
def base_kmeans_specified_init(self, nb_clusters, src_file, data_without_target, dataset_name, run_number,
                               init, run_info=None, nb_iterations=None):
    self._init()
    output_file, centroids_file = self._prepare_files(dataset_name, run_info, True)

    # Create a KMeans model; n_init=1 runs a single initialization (init may
    # be an explicit centroid array, in which case restarts would be wasted).
    params = {"n_clusters": nb_clusters, "init": init, "n_init": 1}
    if self.tolerance is not None:
        params["tol"] = self.tolerance
    if nb_iterations is not None:
        params["max_iter"] = nb_iterations

    sklearn_kmean_model = sklearn.cluster.KMeans(**params)
    sklearn_kmean_model.fit(data_without_target)

    ClusteringToolkit._save_clustering(self._clustering_to_list(data_without_target, sklearn_kmean_model.labels_),
                                       output_file)
    ClusteringToolkit._save_centroids(self._centroids_to_list(sklearn_kmean_model), centroids_file)

    return output_file, {"centroids": centroids_file}
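
# A standalone sketch of scikit-learn KMeans with a fixed starting point, the
# pattern base_kmeans_specified_init builds via its params dict. init_centroids
# is an assumption: a (k, d) ndarray such as the one produced by
# read_or_draw_centroids below.
def _example_sklearn_kmeans_fixed_init_sketch(X, init_centroids):
    import sklearn.cluster

    model = sklearn.cluster.KMeans(n_clusters=init_centroids.shape[0],
                                   init=init_centroids, n_init=1)
    model.fit(X)
    return model.labels_, model.cluster_centers_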
def read_or_draw_centroids(dataset_name, run_info, nb_clusters, data, redirect_output=None):
    drawn_clusters_file_path = ClusteringToolkit.dataset_out_file_name_static(
        dataset_name, "{}.init_set_clusters".format(run_info))

    if redirect_output is not None:
        base_name = os.path.basename(drawn_clusters_file_path)
        drawn_clusters_file_path = os.path.join(redirect_output, base_name)

    if not os.path.exists(drawn_clusters_file_path):
        # Draw a random value for each feature; these centroids are the
        # starting point for *ALL* algorithms.
        initial_clusters = draw_centroids(nb_clusters, data, drawn_clusters_file_path)
    else:
        # Reread from the file to get float32 values (required by TF).
        initial_clusters = read_centroids_file(drawn_clusters_file_path)

    return drawn_clusters_file_path, initial_clusters
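
# A sketch of what a helper like draw_centroids might do (assumption; the real
# implementation is not shown): sample each centroid coordinate uniformly
# within the per-feature min/max of the data, persist the result as float32,
# and return it. np.random here is illustrative; a seeded generator would make
# runs reproducible.
def _example_draw_centroids_sketch(nb_clusters, data, out_path):
    import numpy as np

    lo = data.min().values  # per-feature minimum (data is a DataFrame)
    hi = data.max().values  # per-feature maximum
    centroids = np.random.uniform(lo, hi, size=(nb_clusters, len(lo))).astype(np.float32)
    np.savetxt(out_path, centroids)
    return centroids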
def _save_run(ret, data_without_target, output_file, centroids_file):
    # ret is the tuple returned by cv2.kmeans: (compactness, labels, centers).
    ClusteringToolkit._save_clustering(OpenCV._clustering_to_list(data_without_target, ret[1]), output_file)
    ClusteringToolkit._save_centroids(OpenCV._centroids_to_list(ret[2]), centroids_file)
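
# A sketch of the cv2.kmeans call that plausibly produces the `ret` tuple
# consumed by _save_run (assumption; the calling code is not shown). OpenCV
# returns (compactness, labels, centers), hence ret[1] and ret[2] above.
def _example_cv2_kmeans_sketch(X, k, attempts=10):
    import cv2
    import numpy as np

    # Stop after 10 iterations or when centers move less than epsilon = 1.0.
    criteria = (cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_MAX_ITER, 10, 1.0)
    return cv2.kmeans(X.astype(np.float32), k, None, criteria, attempts,
                      cv2.KMEANS_RANDOM_CENTERS)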