def _train_clustering(point_set, distance_name, k):
    labels = np.array([0] * len(point_set))
    features = np.zeros((2, len(point_set)))
    for i in range(len(point_set)):
        features[0, i] = point_set[i]['x']
        features[1, i] = point_set[i]['y']
        labels[i] = point_set[i]['label']
    # labels are wrapped here but not used by the KMeans training below
    lab = sg.BinaryLabels(labels)
    train = sg.RealFeatures(features)
    if distance_name == "EuclideanDistance":
        distance = sg.EuclideanDistance(train, train)
    elif distance_name == "ManhattanMetric":
        distance = sg.ManhattanMetric(train, train)
    elif distance_name == "JensenMetric":
        distance = sg.JensenMetric(train, train)
    else:
        raise TypeError("unsupported distance: %s" % distance_name)
    kmeans = sg.KMeans(k, distance)
    kmeans.train()
    return kmeans
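# A minimal usage sketch for _train_clustering, assuming `sg` is the legacy
# shogun Python module and `np` is numpy; the sample points below are
# illustrative only:
sample_points = [
    {'x': 0.0, 'y': 0.1, 'label': 0},
    {'x': 0.2, 'y': 0.0, 'label': 0},
    {'x': 5.0, 'y': 5.1, 'label': 1},
    {'x': 5.2, 'y': 4.9, 'label': 1},
]
model = _train_clustering(sample_points, "EuclideanDistance", k=2)
centers = model.get_cluster_centers()  # 2 x k matrix, one centroid per column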
def shogunProcess(clustersNumber, dataLessTarget, datasetName, runinfo=None, initialClusters=None):
    import shogun
    outputFile = datasetOutFile(datasetName, SHOGUN_ALGO, runinfo=runinfo)
    if os.path.exists(outputFile):
        print("shogun skipped")
        return
    train_features = shogun.RealFeatures(dataLessTarget.values.astype("float64").transpose())
    # distance metric over feature matrix - Euclidean distance
    distance = shogun.EuclideanDistance(train_features, train_features)
    hierarchical = shogun.Hierarchical(clustersNumber, distance)
    # the clustering must be trained before its results can be queried
    hierarchical.train()
    # TODO: makes the Python process die!!!???!!!
    d = hierarchical.get_merge_distances()
    cp = hierarchical.get_cluster_pairs()
    with open(outputFile, 'w') as csvfile:
        filewriter = csv.writer(csvfile, quoting=csv.QUOTE_MINIMAL)
        for index, row in dataLessTarget.iterrows():
            # `result` is assumed to hold one cluster assignment per data point
            filewriter.writerow([index, result[index].item(0)])
def run_kmeans_base(self, nb_clusters, src_file, data_without_target,
                    dataset_name, run_number, config_function,
                    run_info=None, nb_iterations=None):
    self._init()
    output_file, centroids_file = self._prepare_files(dataset_name, run_info, True)
    train_features = shogun.RealFeatures(
        data_without_target.values.astype("float64").transpose())
    # distance metric over feature matrix - Euclidean distance
    distance = shogun.EuclideanDistance(train_features, train_features)
    # KMeans object created
    kmeans = shogun.KMeans(nb_clusters, distance)
    if config_function is not None:
        config_function(kmeans)
    if nb_iterations is not None:
        kmeans.set_max_iter(nb_iterations)
    centers, result = Shogun._kmeans_process(kmeans)
    ClusteringToolkit._save_clustering(
        Shogun._clustering_to_list(data_without_target, result), output_file)
    ClusteringToolkit._save_centroids(Shogun._centroids_to_list(centers), centroids_file)
    return output_file, {"centroids": centroids_file}
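# A hedged example of a `config_function` callback as accepted above: it simply
# receives the shogun KMeans object and tweaks it before training; the
# set_use_kmeanspp call is the same legacy API used in shogunProcess below.
def use_kmeanspp(kmeans):
    kmeans.set_use_kmeanspp(True)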
def shogunProcess(clustersNumber, dataLessTarget, datasetName, runinfo=None, initialClusters=None):
    import shogun
    outputFile = datasetOutFile(datasetName, SHOGUN_ALGO, runinfo=runinfo)
    clustersOutputFile = datasetOutFile(datasetName, centroidFor(SHOGUN_ALGO), runinfo=runinfo)
    if os.path.exists(outputFile) and os.path.exists(clustersOutputFile):
        print("shogun skipped")
        return
    train_features = shogun.RealFeatures(dataLessTarget.values.astype("float64").transpose())
    # distance metric over feature matrix - Euclidean distance
    distance = shogun.EuclideanDistance(train_features, train_features)
    # KMeans object created
    kmeans = shogun.KMeans(clustersNumber, distance)
    if initialClusters is None:
        # set KMeans++ flag
        kmeans.set_use_kmeanspp(True)
    else:
        # set new initial centers
        kmeans.set_initial_centers(initialClusters.astype("float64").transpose())
    # KMeans training
    kmeans.train()
    # cluster centers
    centers = kmeans.get_cluster_centers()
    # Labels for data points
    result = kmeans.apply()
    with open(outputFile, 'w') as csvfile:
        filewriter = csv.writer(csvfile, quoting=csv.QUOTE_MINIMAL)
        for index, row in dataLessTarget.iterrows():
            filewriter.writerow([index, result[index].item(0)])
    with open(clustersOutputFile, 'w') as clusterFile:
        filewriter = csv.writer(clusterFile, quoting=csv.QUOTE_MINIMAL)
        for row in centers.transpose():
            filewriter.writerow(row.tolist())
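# A hedged calling sketch for shogunProcess, assuming the helpers
# datasetOutFile/centroidFor/SHOGUN_ALGO come from this codebase; the toy
# DataFrame and initial centers below are illustrative only.
import numpy as np
import pandas as pd

df = pd.DataFrame({"x": [0.0, 0.2, 5.0, 5.2], "y": [0.1, 0.0, 5.1, 4.9]})
initial = np.array([[0.0, 0.0], [5.0, 5.0]])  # one row per centroid
shogunProcess(2, df, "toy_dataset", initialClusters=initial)  # seeded run
shogunProcess(2, df, "toy_dataset")                           # KMeans++ initialisation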
def run_hierarchical(self, nb_clusters, src_file, data_without_target,
                     dataset_name, run_number, run_info=None):
    output_file, = self._prepare_files(dataset_name, run_info, False)
    train_features = shogun.RealFeatures(
        data_without_target.values.astype("float64").transpose())
    # distance metric over feature matrix - Euclidean distance
    distance = shogun.EuclideanDistance(train_features, train_features)
    hierarchical = shogun.Hierarchical(nb_clusters, distance)
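    # A hedged continuation sketch: training and result extraction would use
    # the same legacy Hierarchical calls as shogunProcess above; how the
    # assignments get written to output_file is an assumption, not shown here.
    # hierarchical.train()
    # merge_distances = hierarchical.get_merge_distances()
    # cluster_pairs = hierarchical.get_cluster_pairs()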
# load data
feature_matrix = data.swissroll()
# create features instance
features = sg.RealFeatures(feature_matrix)
# create Diffusion Maps converter instance
converter = sg.DiffusionMaps()
# set target dimensionality
converter.set_target_dim(2)
# set number of time-steps
converter.set_t(2)
# set width of gaussian kernel
converter.set_width(10.0)
# create euclidean distance instance
distance = sg.EuclideanDistance()
# enable converter instance to use created distance instance
converter.set_distance(distance)
# compute embedding with Diffusion Maps method
embedding = converter.embed(features)
# compute custom distance matrix
distance_matrix = np.exp(-np.dot(feature_matrix.T, feature_matrix))
# create Custom Distance instance
custom_distance = sg.CustomDistance(distance_matrix)
# construct embedding based on created distance
distance_embedding = converter.embed_distance(custom_distance)
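# A brief follow-up sketch, assuming the legacy shogun bindings where dense
# feature objects expose get_feature_matrix(): pull the embedded coordinates
# out as plain numpy arrays (target_dim rows, one column per point).
embedding_matrix = embedding.get_feature_matrix()
distance_embedding_matrix = distance_embedding.get_feature_matrix()
print(embedding_matrix.shape, distance_embedding_matrix.shape)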